From 58b9ca1e6f7768b23e752dabc30468c06d0e1c57 Mon Sep 17 00:00:00 2001 From: Kent Yao Date: Mon, 10 Feb 2020 16:23:44 +0900 Subject: [PATCH 001/185] [SPARK-30592][SQL][FOLLOWUP] Add some round-trip test cases ### What changes were proposed in this pull request? Add round-trip tests for CSV and JSON functions as https://github.com/apache/spark/pull/27317#discussion_r376745135 asked. ### Why are the changes needed? improve test coverage ### Does this PR introduce any user-facing change? no ### How was this patch tested? add uts Closes #27510 from yaooqinn/SPARK-30592-F. Authored-by: Kent Yao Signed-off-by: HyukjinKwon --- .../resources/sql-tests/inputs/interval.sql | 14 +++++-- .../sql-tests/results/ansi/interval.sql.out | 38 ++++++++----------- .../sql-tests/results/interval.sql.out | 38 ++++++++----------- 3 files changed, 40 insertions(+), 50 deletions(-) diff --git a/sql/core/src/test/resources/sql-tests/inputs/interval.sql b/sql/core/src/test/resources/sql-tests/inputs/interval.sql index fb6c485f619ae..a4e621e9639d4 100644 --- a/sql/core/src/test/resources/sql-tests/inputs/interval.sql +++ b/sql/core/src/test/resources/sql-tests/inputs/interval.sql @@ -222,7 +222,13 @@ select a * 1.1 from values (interval '-2147483648 months', interval '2147483647 select a / 0.5 from values (interval '-2147483648 months', interval '2147483647 months') t(a, b); -- interval support for csv and json functions -SELECT from_csv('1, 1 day', 'a INT, b interval'); -SELECT to_csv(named_struct('a', interval 32 month, 'b', interval 70 minute)); -SELECT from_json('{"a":"1 days"}', 'a interval'); -SELECT to_json(map('a', interval 25 month 100 day 130 minute)); +SELECT + from_csv('1, 1 day', 'a INT, b interval'), + to_csv(from_csv('1, 1 day', 'a INT, b interval')), + to_csv(named_struct('a', interval 32 month, 'b', interval 70 minute)), + from_csv(to_csv(named_struct('a', interval 32 month, 'b', interval 70 minute)), 'a interval, b interval'); +SELECT + from_json('{"a":"1 days"}', 'a interval'), + to_json(from_json('{"a":"1 days"}', 'a interval')), + to_json(map('a', interval 25 month 100 day 130 minute)), + from_json(to_json(map('a', interval 25 month 100 day 130 minute)), 'a interval'); diff --git a/sql/core/src/test/resources/sql-tests/results/ansi/interval.sql.out b/sql/core/src/test/resources/sql-tests/results/ansi/interval.sql.out index f37049064d989..7fdb4c53d1dcb 100644 --- a/sql/core/src/test/resources/sql-tests/results/ansi/interval.sql.out +++ b/sql/core/src/test/resources/sql-tests/results/ansi/interval.sql.out @@ -1,5 +1,5 @@ -- Automatically generated by SQLQueryTestSuite --- Number of queries: 101 +-- Number of queries: 99 -- !query @@ -988,32 +988,24 @@ integer overflow -- !query -SELECT from_csv('1, 1 day', 'a INT, b interval') --- !query schema -struct> --- !query output -{"a":1,"b":1 days} - - --- !query -SELECT to_csv(named_struct('a', interval 32 month, 'b', interval 70 minute)) --- !query schema -struct --- !query output -2 years 8 months,1 hours 10 minutes - - --- !query -SELECT from_json('{"a":"1 days"}', 'a interval') +SELECT + from_csv('1, 1 day', 'a INT, b interval'), + to_csv(from_csv('1, 1 day', 'a INT, b interval')), + to_csv(named_struct('a', interval 32 month, 'b', interval 70 minute)), + from_csv(to_csv(named_struct('a', interval 32 month, 'b', interval 70 minute)), 'a interval, b interval') -- !query schema -struct> +struct,to_csv(from_csv(1, 1 day)):string,to_csv(named_struct(a, INTERVAL '2 years 8 months', b, INTERVAL '1 hours 10 minutes')):string,from_csv(to_csv(named_struct(a, 
INTERVAL '2 years 8 months', b, INTERVAL '1 hours 10 minutes'))):struct> -- !query output -{"a":1 days} +{"a":1,"b":1 days} 1,1 days 2 years 8 months,1 hours 10 minutes {"a":2 years 8 months,"b":1 hours 10 minutes} -- !query -SELECT to_json(map('a', interval 25 month 100 day 130 minute)) +SELECT + from_json('{"a":"1 days"}', 'a interval'), + to_json(from_json('{"a":"1 days"}', 'a interval')), + to_json(map('a', interval 25 month 100 day 130 minute)), + from_json(to_json(map('a', interval 25 month 100 day 130 minute)), 'a interval') -- !query schema -struct +struct,to_json(from_json({"a":"1 days"})):string,to_json(map(a, INTERVAL '2 years 1 months 100 days 2 hours 10 minutes')):string,from_json(to_json(map(a, INTERVAL '2 years 1 months 100 days 2 hours 10 minutes'))):struct> -- !query output -{"a":"2 years 1 months 100 days 2 hours 10 minutes"} +{"a":1 days} {"a":"1 days"} {"a":"2 years 1 months 100 days 2 hours 10 minutes"} {"a":2 years 1 months 100 days 2 hours 10 minutes} diff --git a/sql/core/src/test/resources/sql-tests/results/interval.sql.out b/sql/core/src/test/resources/sql-tests/results/interval.sql.out index 94b4f15815ca5..3c4b4301d0025 100644 --- a/sql/core/src/test/resources/sql-tests/results/interval.sql.out +++ b/sql/core/src/test/resources/sql-tests/results/interval.sql.out @@ -1,5 +1,5 @@ -- Automatically generated by SQLQueryTestSuite --- Number of queries: 101 +-- Number of queries: 99 -- !query @@ -969,32 +969,24 @@ integer overflow -- !query -SELECT from_csv('1, 1 day', 'a INT, b interval') --- !query schema -struct> --- !query output -{"a":1,"b":1 days} - - --- !query -SELECT to_csv(named_struct('a', interval 32 month, 'b', interval 70 minute)) --- !query schema -struct --- !query output -2 years 8 months,1 hours 10 minutes - - --- !query -SELECT from_json('{"a":"1 days"}', 'a interval') +SELECT + from_csv('1, 1 day', 'a INT, b interval'), + to_csv(from_csv('1, 1 day', 'a INT, b interval')), + to_csv(named_struct('a', interval 32 month, 'b', interval 70 minute)), + from_csv(to_csv(named_struct('a', interval 32 month, 'b', interval 70 minute)), 'a interval, b interval') -- !query schema -struct> +struct,to_csv(from_csv(1, 1 day)):string,to_csv(named_struct(a, INTERVAL '2 years 8 months', b, INTERVAL '1 hours 10 minutes')):string,from_csv(to_csv(named_struct(a, INTERVAL '2 years 8 months', b, INTERVAL '1 hours 10 minutes'))):struct> -- !query output -{"a":1 days} +{"a":1,"b":1 days} 1,1 days 2 years 8 months,1 hours 10 minutes {"a":2 years 8 months,"b":1 hours 10 minutes} -- !query -SELECT to_json(map('a', interval 25 month 100 day 130 minute)) +SELECT + from_json('{"a":"1 days"}', 'a interval'), + to_json(from_json('{"a":"1 days"}', 'a interval')), + to_json(map('a', interval 25 month 100 day 130 minute)), + from_json(to_json(map('a', interval 25 month 100 day 130 minute)), 'a interval') -- !query schema -struct +struct,to_json(from_json({"a":"1 days"})):string,to_json(map(a, INTERVAL '2 years 1 months 100 days 2 hours 10 minutes')):string,from_json(to_json(map(a, INTERVAL '2 years 1 months 100 days 2 hours 10 minutes'))):struct> -- !query output -{"a":"2 years 1 months 100 days 2 hours 10 minutes"} +{"a":1 days} {"a":"1 days"} {"a":"2 years 1 months 100 days 2 hours 10 minutes"} {"a":2 years 1 months 100 days 2 hours 10 minutes} From 70e545a94d47afb2848c24e81c908d28d41016da Mon Sep 17 00:00:00 2001 From: Terry Kim Date: Mon, 10 Feb 2020 19:04:49 +0800 Subject: [PATCH 002/185] [SPARK-30757][SQL][DOC] Update the doc on TableCatalog.alterTable's behavior ### What changes 
were proposed in this pull request? This PR updates the documentation on `TableCatalog.alterTable`s behavior on the order by which the requested changes are applied. It now explicitly mentions that the changes are applied in the order given. ### Why are the changes needed? The current documentation on `TableCatalog.alterTable` doesn't mention which order the requested changes are applied. It will be useful to explicitly document this behavior so that the user can expect the behavior. For example, `REPLACE COLUMNS` needs to delete columns before adding new columns, and if the order is guaranteed by `alterTable`, it's much easier to work with the catalog API. ### Does this PR introduce any user-facing change? Yes, document change. ### How was this patch tested? Not added (doc changes). Closes #27496 from imback82/catalog_table_alter_table. Authored-by: Terry Kim Signed-off-by: Wenchen Fan --- .../org/apache/spark/sql/connector/catalog/TableCatalog.java | 2 ++ 1 file changed, 2 insertions(+) diff --git a/sql/catalyst/src/main/java/org/apache/spark/sql/connector/catalog/TableCatalog.java b/sql/catalyst/src/main/java/org/apache/spark/sql/connector/catalog/TableCatalog.java index a69b23bf84d0c..2f102348ec517 100644 --- a/sql/catalyst/src/main/java/org/apache/spark/sql/connector/catalog/TableCatalog.java +++ b/sql/catalyst/src/main/java/org/apache/spark/sql/connector/catalog/TableCatalog.java @@ -134,6 +134,8 @@ Table createTable( * Implementations may reject the requested changes. If any change is rejected, none of the * changes should be applied to the table. *

+ * The requested changes must be applied in the order given.
+ * <p>

* If the catalog supports views and contains a view for the identifier and not a table, this * must throw {@link NoSuchTableException}. * From 5a240603fd920e3cb5d9ef49c31d46df8a630d8c Mon Sep 17 00:00:00 2001 From: jiake Date: Mon, 10 Feb 2020 21:48:00 +0800 Subject: [PATCH 003/185] [SPARK-30719][SQL] Add unit test to verify the log warning print when intentionally skip AQE ### What changes were proposed in this pull request? This is a follow up in [#27452](https://github.com/apache/spark/pull/27452). Add a unit test to verify whether the log warning is print when intentionally skip AQE. ### Why are the changes needed? Add unit test ### Does this PR introduce any user-facing change? No ### How was this patch tested? adding unit test Closes #27515 from JkSelf/aqeLoggingWarningTest. Authored-by: jiake Signed-off-by: Wenchen Fan --- .../adaptive/AdaptiveQueryExecSuite.scala | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/adaptive/AdaptiveQueryExecSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/adaptive/AdaptiveQueryExecSuite.scala index 96e977221e512..a2071903bea7e 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/adaptive/AdaptiveQueryExecSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/adaptive/AdaptiveQueryExecSuite.scala @@ -789,4 +789,19 @@ class AdaptiveQueryExecSuite assert(plan.isInstanceOf[AdaptiveSparkPlanExec]) } } + + test("SPARK-30719: do not log warning if intentionally skip AQE") { + val testAppender = new LogAppender("aqe logging warning test when skip") + withLogAppender(testAppender) { + withSQLConf( + SQLConf.ADAPTIVE_EXECUTION_ENABLED.key -> "true") { + val plan = sql("SELECT * FROM testData").queryExecution.executedPlan + assert(!plan.isInstanceOf[AdaptiveSparkPlanExec]) + } + } + assert(!testAppender.loggingEvents + .exists(msg => msg.getRenderedMessage.contains( + s"${SQLConf.ADAPTIVE_EXECUTION_ENABLED.key} is" + + s" enabled but is not supported for"))) + } } From b2011a295bd78b3693a516e049e90250366b8f52 Mon Sep 17 00:00:00 2001 From: Eric Wu <492960551@qq.com> Date: Mon, 10 Feb 2020 23:41:39 +0800 Subject: [PATCH 004/185] [SPARK-30326][SQL] Raise exception if analyzer exceed max iterations ### What changes were proposed in this pull request? Enhance RuleExecutor strategy to take different actions when exceeding max iterations. And raise exception if analyzer exceed max iterations. ### Why are the changes needed? Currently, both analyzer and optimizer just log warning message if rule execution exceed max iterations. They should have different behavior. Analyzer should raise exception to indicates the plan is not fixed after max iterations, while optimizer just log warning to keep the current plan. This is more feasible after SPARK-30138 was introduced. ### Does this PR introduce any user-facing change? No ### How was this patch tested? Add test in AnalysisSuite Closes #26977 from Eric5553/EnhanceMaxIterations. 
Authored-by: Eric Wu <492960551@qq.com> Signed-off-by: Wenchen Fan --- .../sql/catalyst/analysis/Analyzer.scala | 10 ++++++- .../sql/catalyst/optimizer/Optimizer.scala | 5 +++- .../sql/catalyst/rules/RuleExecutor.scala | 27 ++++++++++++++++--- .../sql/catalyst/analysis/AnalysisSuite.scala | 25 ++++++++++++++++- 4 files changed, 60 insertions(+), 7 deletions(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala index 75f1aa7185ef3..ce82b3b567b54 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala @@ -176,7 +176,15 @@ class Analyzer( def resolver: Resolver = conf.resolver - protected val fixedPoint = FixedPoint(maxIterations) + /** + * If the plan cannot be resolved within maxIterations, analyzer will throw exception to inform + * user to increase the value of SQLConf.ANALYZER_MAX_ITERATIONS. + */ + protected val fixedPoint = + FixedPoint( + maxIterations, + errorOnExceed = true, + maxIterationsSetting = SQLConf.ANALYZER_MAX_ITERATIONS.key) /** * Override to provide additional rules for the "Resolution" batch. diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/Optimizer.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/Optimizer.scala index 935d62015afa1..08acac18f48bb 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/Optimizer.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/Optimizer.scala @@ -53,7 +53,10 @@ abstract class Optimizer(catalogManager: CatalogManager) "PartitionPruning", "Extract Python UDFs") - protected def fixedPoint = FixedPoint(SQLConf.get.optimizerMaxIterations) + protected def fixedPoint = + FixedPoint( + SQLConf.get.optimizerMaxIterations, + maxIterationsSetting = SQLConf.OPTIMIZER_MAX_ITERATIONS.key) /** * Defines the default rule batches in the Optimizer. diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/rules/RuleExecutor.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/rules/RuleExecutor.scala index 287ae0e8e9f67..da5242bee28e1 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/rules/RuleExecutor.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/rules/RuleExecutor.scala @@ -45,7 +45,17 @@ abstract class RuleExecutor[TreeType <: TreeNode[_]] extends Logging { * An execution strategy for rules that indicates the maximum number of executions. If the * execution reaches fix point (i.e. converge) before maxIterations, it will stop. */ - abstract class Strategy { def maxIterations: Int } + abstract class Strategy { + + /** The maximum number of executions. */ + def maxIterations: Int + + /** Whether to throw exception when exceeding the maximum number. */ + def errorOnExceed: Boolean = false + + /** The key of SQLConf setting to tune maxIterations */ + def maxIterationsSetting: String = null + } /** A strategy that is run once and idempotent. */ case object Once extends Strategy { val maxIterations = 1 } @@ -54,7 +64,10 @@ abstract class RuleExecutor[TreeType <: TreeNode[_]] extends Logging { * A strategy that runs until fix point or maxIterations times, whichever comes first. * Especially, a FixedPoint(1) batch is supposed to run only once. 
*/ - case class FixedPoint(maxIterations: Int) extends Strategy + case class FixedPoint( + override val maxIterations: Int, + override val errorOnExceed: Boolean = false, + override val maxIterationsSetting: String = null) extends Strategy /** A batch of rules. */ protected case class Batch(name: String, strategy: Strategy, rules: Rule[TreeType]*) @@ -155,8 +168,14 @@ abstract class RuleExecutor[TreeType <: TreeNode[_]] extends Logging { if (iteration > batch.strategy.maxIterations) { // Only log if this is a rule that is supposed to run more than once. if (iteration != 2) { - val message = s"Max iterations (${iteration - 1}) reached for batch ${batch.name}" - if (Utils.isTesting) { + val endingMsg = if (batch.strategy.maxIterationsSetting == null) { + "." + } else { + s", please set '${batch.strategy.maxIterationsSetting}' to a larger value." + } + val message = s"Max iterations (${iteration - 1}) reached for batch ${batch.name}" + + s"$endingMsg" + if (Utils.isTesting || batch.strategy.errorOnExceed) { throw new TreeNodeException(curPlan, message, null) } else { logWarning(message) diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/AnalysisSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/AnalysisSuite.scala index c747d394b1bc2..d38513319388b 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/AnalysisSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/AnalysisSuite.scala @@ -25,9 +25,10 @@ import org.scalatest.Matchers import org.apache.spark.api.python.PythonEvalType import org.apache.spark.sql.catalyst.TableIdentifier -import org.apache.spark.sql.catalyst.catalog.{CatalogStorageFormat, CatalogTable, CatalogTableType} +import org.apache.spark.sql.catalyst.catalog.{CatalogStorageFormat, CatalogTable, CatalogTableType, InMemoryCatalog, SessionCatalog} import org.apache.spark.sql.catalyst.dsl.expressions._ import org.apache.spark.sql.catalyst.dsl.plans._ +import org.apache.spark.sql.catalyst.errors.TreeNodeException import org.apache.spark.sql.catalyst.expressions._ import org.apache.spark.sql.catalyst.expressions.aggregate.{AggregateExpression, Count, Sum} import org.apache.spark.sql.catalyst.parser.CatalystSqlParser.parsePlan @@ -745,4 +746,26 @@ class AnalysisSuite extends AnalysisTest with Matchers { CollectMetrics("evt1", sumWithFilter :: Nil, testRelation), "aggregates with filter predicate are not allowed" :: Nil) } + + test("Analysis exceed max iterations") { + // RuleExecutor only throw exception or log warning when the rule is supposed to run + // more than once. 
+ val maxIterations = 2 + val conf = new SQLConf().copy(SQLConf.ANALYZER_MAX_ITERATIONS -> maxIterations) + val testAnalyzer = new Analyzer( + new SessionCatalog(new InMemoryCatalog, FunctionRegistry.builtin, conf), conf) + + val plan = testRelation2.select( + $"a" / Literal(2) as "div1", + $"a" / $"b" as "div2", + $"a" / $"c" as "div3", + $"a" / $"d" as "div4", + $"e" / $"e" as "div5") + + val message = intercept[TreeNodeException[LogicalPlan]] { + testAnalyzer.execute(plan) + }.getMessage + assert(message.startsWith(s"Max iterations ($maxIterations) reached for batch Resolution, " + + s"please set '${SQLConf.ANALYZER_MAX_ITERATIONS.key}' to a larger value.")) + } } From acfdb46a60fc06dac0af55951492d74b7073f546 Mon Sep 17 00:00:00 2001 From: Liang-Chi Hsieh Date: Mon, 10 Feb 2020 10:45:00 -0800 Subject: [PATCH 005/185] [SPARK-27946][SQL][FOLLOW-UP] Change doc and error message for SHOW CREATE TABLE ### What changes were proposed in this pull request? This is a follow-up for #24938 to tweak error message and migration doc. ### Why are the changes needed? Making user know workaround if SHOW CREATE TABLE doesn't work for some Hive tables. ### Does this PR introduce any user-facing change? No ### How was this patch tested? Existing unit tests. Closes #27505 from viirya/SPARK-27946-followup. Authored-by: Liang-Chi Hsieh Signed-off-by: Liang-Chi Hsieh --- docs/sql-migration-guide.md | 2 +- .../org/apache/spark/sql/execution/command/tables.scala | 8 ++++++-- 2 files changed, 7 insertions(+), 3 deletions(-) diff --git a/docs/sql-migration-guide.md b/docs/sql-migration-guide.md index be0fe32ded99b..26eb5838892b4 100644 --- a/docs/sql-migration-guide.md +++ b/docs/sql-migration-guide.md @@ -326,7 +326,7 @@ license: | - Since Spark 3.0, `SHOW TBLPROPERTIES` will cause `AnalysisException` if the table does not exist. In Spark version 2.4 and earlier, this scenario caused `NoSuchTableException`. Also, `SHOW TBLPROPERTIES` on a temporary view will cause `AnalysisException`. In Spark version 2.4 and earlier, it returned an empty result. - - Since Spark 3.0, `SHOW CREATE TABLE` will always return Spark DDL, even when the given table is a Hive serde table. For Hive DDL, please use `SHOW CREATE TABLE AS SERDE` command instead. + - Since Spark 3.0, `SHOW CREATE TABLE` will always return Spark DDL, even when the given table is a Hive serde table. For generating Hive DDL, please use `SHOW CREATE TABLE AS SERDE` command instead. - Since Spark 3.0, we upgraded the built-in Hive from 1.2 to 2.3. This may need to set `spark.sql.hive.metastore.version` and `spark.sql.hive.metastore.jars` according to the version of the Hive metastore. For example: set `spark.sql.hive.metastore.version` to `1.2.1` and `spark.sql.hive.metastore.jars` to `maven` if your Hive metastore version is 1.2.1. 
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/command/tables.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/command/tables.scala index 468ca505cce1f..90dbdf5515d4d 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/command/tables.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/command/tables.scala @@ -1076,7 +1076,9 @@ case class ShowCreateTableCommand(table: TableIdentifier) "Failed to execute SHOW CREATE TABLE against table " + s"${tableMetadata.identifier}, which is created by Hive and uses the " + "following unsupported feature(s)\n" + - tableMetadata.unsupportedFeatures.map(" - " + _).mkString("\n") + tableMetadata.unsupportedFeatures.map(" - " + _).mkString("\n") + ". " + + s"Please use `SHOW CREATE TABLE ${tableMetadata.identifier} AS SERDE` " + + "to show Hive DDL instead." ) } @@ -1086,7 +1088,9 @@ case class ShowCreateTableCommand(table: TableIdentifier) if ("true".equalsIgnoreCase(tableMetadata.properties.getOrElse("transactional", "false"))) { throw new AnalysisException( - "SHOW CREATE TABLE doesn't support transactional Hive table") + "SHOW CREATE TABLE doesn't support transactional Hive table. " + + s"Please use `SHOW CREATE TABLE ${tableMetadata.identifier} AS SERDE` " + + "to show Hive DDL instead.") } convertTableMetadata(tableMetadata) From 4439b29bd2ac0c3cc4c6ceea825fc797ff0029a3 Mon Sep 17 00:00:00 2001 From: HyukjinKwon Date: Mon, 10 Feb 2020 10:56:43 -0800 Subject: [PATCH 006/185] Revert "[SPARK-30245][SQL] Add cache for Like and RLike when pattern is not static" ### What changes were proposed in this pull request? This reverts commit 8ce7962931680c204e84dd75783b1c943ea9c525. There's variable name conflicts with https://github.com/apache/spark/commit/8aebc80e0e67bcb1aa300b8c8b1a209159237632#diff-39298b470865a4cbc67398a4ea11e767. This can be cleanly ported back to branch-3.0. ### Why are the changes needed? Performance investigation were not made enough and it's not clear if it really beneficial or now. ### Does this PR introduce any user-facing change? No. ### How was this patch tested? Jenkins tests. Closes #27514 from HyukjinKwon/revert-cache-PR. Authored-by: HyukjinKwon Signed-off-by: Xiao Li --- .../expressions/regexpExpressions.scala | 21 ++++++------------- 1 file changed, 6 insertions(+), 15 deletions(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/regexpExpressions.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/regexpExpressions.scala index c9ddc70bf5bc6..f84c476ea5807 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/regexpExpressions.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/regexpExpressions.scala @@ -177,6 +177,8 @@ case class Like(str: Expression, pattern: Expression, escape: Expression) """) } } else { + val patternStr = ctx.freshName("patternStr") + val compiledPattern = ctx.freshName("compiledPattern") // We need double escape to avoid org.codehaus.commons.compiler.CompileException. // '\\' will cause exception 'Single quote must be backslash-escaped in character literal'. // '\"' will cause exception 'Line break in literal not allowed'. 
@@ -185,17 +187,11 @@ case class Like(str: Expression, pattern: Expression, escape: Expression) } else { escapeChar } - val patternStr = ctx.freshName("patternStr") - val compiledPattern = ctx.addMutableState(patternClass, "compiledPattern") - val lastPatternStr = ctx.addMutableState(classOf[String].getName, "lastPatternStr") - nullSafeCodeGen(ctx, ev, (eval1, eval2, _) => { s""" String $patternStr = $eval2.toString(); - if (!$patternStr.equals($lastPatternStr)) { - $compiledPattern = $patternClass.compile($escapeFunc($patternStr, '$newEscapeChar')); - $lastPatternStr = $patternStr; - } + $patternClass $compiledPattern = $patternClass.compile( + $escapeFunc($patternStr, '$newEscapeChar')); ${ev.value} = $compiledPattern.matcher($eval1.toString()).matches(); """ }) @@ -278,16 +274,11 @@ case class RLike(left: Expression, right: Expression) } } else { val rightStr = ctx.freshName("rightStr") - val pattern = ctx.addMutableState(patternClass, "pattern") - val lastRightStr = ctx.addMutableState(classOf[String].getName, "lastRightStr") - + val pattern = ctx.freshName("pattern") nullSafeCodeGen(ctx, ev, (eval1, eval2) => { s""" String $rightStr = $eval2.toString(); - if (!$rightStr.equals($lastRightStr)) { - $pattern = $patternClass.compile($rightStr); - $lastRightStr = $rightStr; - } + $patternClass $pattern = $patternClass.compile($rightStr); ${ev.value} = $pattern.matcher($eval1.toString()).find(0); """ }) From 3c1c9b48fcca1a714e6c2a3045b512598438d672 Mon Sep 17 00:00:00 2001 From: Maxim Gekk Date: Mon, 10 Feb 2020 12:51:37 -0800 Subject: [PATCH 007/185] [SPARK-30759][SQL] Initialize cache for foldable patterns in StringRegexExpression ### What changes were proposed in this pull request? In the PR, I propose to fix `cache` initialization in `StringRegexExpression` by changing `case Literal(value: String, StringType)` to `case p: Expression if p.foldable` ### Why are the changes needed? Actually, the case doesn't work at all because of: 1. Literals value has type `UTF8String` 2. It doesn't work for foldable expressions like in the example: ```sql SELECT '%SystemDrive%\Users\John' _FUNC_ '%SystemDrive%\\Users.*'; ``` Screen Shot 2020-02-08 at 22 45 50 ### Does this PR introduce any user-facing change? No ### How was this patch tested? By the `check outputs of expression examples` test from `SQLQuerySuite`. Closes #27502 from MaxGekk/str-regexp-foldable-pattern. 
Authored-by: Maxim Gekk Signed-off-by: Dongjoon Hyun --- .../spark/sql/catalyst/expressions/regexpExpressions.scala | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/regexpExpressions.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/regexpExpressions.scala index f84c476ea5807..f8d328bf601e4 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/regexpExpressions.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/regexpExpressions.scala @@ -41,9 +41,10 @@ trait StringRegexExpression extends Expression override def dataType: DataType = BooleanType - // try cache the pattern for Literal + // try cache foldable pattern private lazy val cache: Pattern = pattern match { - case Literal(value: String, StringType) => compile(value) + case p: Expression if p.foldable => + compile(p.eval().asInstanceOf[UTF8String].toString) case _ => null } From a6b91d2bf727e175d0e175295001db85647539b1 Mon Sep 17 00:00:00 2001 From: Yuanjian Li Date: Mon, 10 Feb 2020 22:16:25 +0100 Subject: [PATCH 008/185] [SPARK-30556][SQL][FOLLOWUP] Reset the status changed in SQLExecution.withThreadLocalCaptured ### What changes were proposed in this pull request? Follow up for #27267, reset the status changed in SQLExecution.withThreadLocalCaptured. ### Why are the changes needed? For code safety. ### Does this PR introduce any user-facing change? No. ### How was this patch tested? Existing UT. Closes #27516 from xuanyuanking/SPARK-30556-follow. Authored-by: Yuanjian Li Signed-off-by: herman --- .../apache/spark/sql/execution/SQLExecution.scala | 12 +++++++++++- .../sql/internal/ExecutorSideSQLConfSuite.scala | 10 ++++++---- 2 files changed, 17 insertions(+), 5 deletions(-) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/SQLExecution.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/SQLExecution.scala index 995d94ef5eac7..9f177819f6ea7 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/SQLExecution.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/SQLExecution.scala @@ -177,9 +177,19 @@ object SQLExecution { val sc = sparkSession.sparkContext val localProps = Utils.cloneProperties(sc.getLocalProperties) Future { + val originalSession = SparkSession.getActiveSession + val originalLocalProps = sc.getLocalProperties SparkSession.setActiveSession(activeSession) sc.setLocalProperties(localProps) - body + val res = body + // reset active session and local props. 
+ sc.setLocalProperties(originalLocalProps) + if (originalSession.nonEmpty) { + SparkSession.setActiveSession(originalSession.get) + } else { + SparkSession.clearActiveSession() + } + res }(exec) } } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/internal/ExecutorSideSQLConfSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/internal/ExecutorSideSQLConfSuite.scala index 0cc658c499615..46d0c64592a00 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/internal/ExecutorSideSQLConfSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/internal/ExecutorSideSQLConfSuite.scala @@ -17,6 +17,8 @@ package org.apache.spark.sql.internal +import java.util.UUID + import org.scalatest.Assertions._ import org.apache.spark.{SparkException, SparkFunSuite, TaskContext} @@ -144,16 +146,16 @@ class ExecutorSideSQLConfSuite extends SparkFunSuite with SQLTestUtils { } // set local configuration and assert - val confValue1 = "e" + val confValue1 = UUID.randomUUID().toString() createDataframe(confKey, confValue1).createOrReplaceTempView("m") spark.sparkContext.setLocalProperty(confKey, confValue1) - assert(sql("SELECT * FROM l WHERE EXISTS (SELECT * FROM m)").collect.size == 1) + assert(sql("SELECT * FROM l WHERE EXISTS (SELECT * FROM m)").collect().length == 1) // change the conf value and assert again - val confValue2 = "f" + val confValue2 = UUID.randomUUID().toString() createDataframe(confKey, confValue2).createOrReplaceTempView("n") spark.sparkContext.setLocalProperty(confKey, confValue2) - assert(sql("SELECT * FROM l WHERE EXISTS (SELECT * FROM n)").collect().size == 1) + assert(sql("SELECT * FROM l WHERE EXISTS (SELECT * FROM n)").collect().length == 1) } } } From e2ebca733ce4366349a5a25fe94a8e31b67d410e Mon Sep 17 00:00:00 2001 From: Shixiong Zhu Date: Mon, 10 Feb 2020 14:26:14 -0800 Subject: [PATCH 009/185] [SPARK-30779][SS] Fix some API issues found when reviewing Structured Streaming API docs ### What changes were proposed in this pull request? - Fix the scope of `Logging.initializeForcefully` so that it doesn't appear in subclasses' public methods. Right now, `sc.initializeForcefully(false, false)` is allowed to called. - Don't show classes under `org.apache.spark.internal` package in API docs. - Add missing `since` annotation. - Fix the scope of `ArrowUtils` to remove it from the API docs. ### Why are the changes needed? Avoid leaking APIs unintentionally in Spark 3.0.0. ### Does this PR introduce any user-facing change? No. All these changes are to avoid leaking APIs unintentionally in Spark 3.0.0. ### How was this patch tested? Manually generated the API docs and verified the above issues have been fixed. Closes #27528 from zsxwing/audit-ss-apis. 
Authored-by: Shixiong Zhu Signed-off-by: Xiao Li --- core/src/main/scala/org/apache/spark/internal/Logging.scala | 2 +- project/SparkBuild.scala | 1 + .../sql/connector/read/streaming/ContinuousPartitionReader.java | 2 ++ .../read/streaming/ContinuousPartitionReaderFactory.java | 2 ++ .../spark/sql/connector/read/streaming/ContinuousStream.java | 2 ++ .../spark/sql/connector/read/streaming/MicroBatchStream.java | 2 ++ .../org/apache/spark/sql/connector/read/streaming/Offset.java | 2 ++ .../spark/sql/connector/read/streaming/PartitionOffset.java | 2 ++ .../apache/spark/sql/connector/read/streaming/ReadLimit.java | 1 + .../spark/sql/connector/read/streaming/SparkDataStream.java | 2 ++ .../connector/write/streaming/StreamingDataWriterFactory.java | 2 ++ .../spark/sql/connector/write/streaming/StreamingWrite.java | 2 ++ .../src/main/scala/org/apache/spark/sql/util/ArrowUtils.scala | 2 +- 13 files changed, 22 insertions(+), 2 deletions(-) diff --git a/core/src/main/scala/org/apache/spark/internal/Logging.scala b/core/src/main/scala/org/apache/spark/internal/Logging.scala index 2e4846bec2db4..0c1d9635b6535 100644 --- a/core/src/main/scala/org/apache/spark/internal/Logging.scala +++ b/core/src/main/scala/org/apache/spark/internal/Logging.scala @@ -117,7 +117,7 @@ trait Logging { } // For testing - def initializeForcefully(isInterpreter: Boolean, silent: Boolean): Unit = { + private[spark] def initializeForcefully(isInterpreter: Boolean, silent: Boolean): Unit = { initializeLogging(isInterpreter, silent) } diff --git a/project/SparkBuild.scala b/project/SparkBuild.scala index 707c31d2248eb..9d0af3aa8c1b6 100644 --- a/project/SparkBuild.scala +++ b/project/SparkBuild.scala @@ -819,6 +819,7 @@ object Unidoc { .map(_.filterNot(_.getName.contains("$"))) .map(_.filterNot(_.getCanonicalPath.contains("org/apache/spark/deploy"))) .map(_.filterNot(_.getCanonicalPath.contains("org/apache/spark/examples"))) + .map(_.filterNot(_.getCanonicalPath.contains("org/apache/spark/internal"))) .map(_.filterNot(_.getCanonicalPath.contains("org/apache/spark/memory"))) .map(_.filterNot(_.getCanonicalPath.contains("org/apache/spark/network"))) .map(_.filterNot(f => diff --git a/sql/catalyst/src/main/java/org/apache/spark/sql/connector/read/streaming/ContinuousPartitionReader.java b/sql/catalyst/src/main/java/org/apache/spark/sql/connector/read/streaming/ContinuousPartitionReader.java index 8bd5273bb7d8e..c2ad9ec244a0d 100644 --- a/sql/catalyst/src/main/java/org/apache/spark/sql/connector/read/streaming/ContinuousPartitionReader.java +++ b/sql/catalyst/src/main/java/org/apache/spark/sql/connector/read/streaming/ContinuousPartitionReader.java @@ -22,6 +22,8 @@ /** * A variation on {@link PartitionReader} for use with continuous streaming processing. + * + * @since 3.0.0 */ @Evolving public interface ContinuousPartitionReader extends PartitionReader { diff --git a/sql/catalyst/src/main/java/org/apache/spark/sql/connector/read/streaming/ContinuousPartitionReaderFactory.java b/sql/catalyst/src/main/java/org/apache/spark/sql/connector/read/streaming/ContinuousPartitionReaderFactory.java index 962864da4aad8..385c6f655440f 100644 --- a/sql/catalyst/src/main/java/org/apache/spark/sql/connector/read/streaming/ContinuousPartitionReaderFactory.java +++ b/sql/catalyst/src/main/java/org/apache/spark/sql/connector/read/streaming/ContinuousPartitionReaderFactory.java @@ -27,6 +27,8 @@ /** * A variation on {@link PartitionReaderFactory} that returns {@link ContinuousPartitionReader} * instead of {@link PartitionReader}. 
It's used for continuous streaming processing. + * + * @since 3.0.0 */ @Evolving public interface ContinuousPartitionReaderFactory extends PartitionReaderFactory { diff --git a/sql/catalyst/src/main/java/org/apache/spark/sql/connector/read/streaming/ContinuousStream.java b/sql/catalyst/src/main/java/org/apache/spark/sql/connector/read/streaming/ContinuousStream.java index ee01a2553ae7a..a84578fe461a3 100644 --- a/sql/catalyst/src/main/java/org/apache/spark/sql/connector/read/streaming/ContinuousStream.java +++ b/sql/catalyst/src/main/java/org/apache/spark/sql/connector/read/streaming/ContinuousStream.java @@ -23,6 +23,8 @@ /** * A {@link SparkDataStream} for streaming queries with continuous mode. + * + * @since 3.0.0 */ @Evolving public interface ContinuousStream extends SparkDataStream { diff --git a/sql/catalyst/src/main/java/org/apache/spark/sql/connector/read/streaming/MicroBatchStream.java b/sql/catalyst/src/main/java/org/apache/spark/sql/connector/read/streaming/MicroBatchStream.java index ceab0f75734d3..40ecbf0578ee5 100644 --- a/sql/catalyst/src/main/java/org/apache/spark/sql/connector/read/streaming/MicroBatchStream.java +++ b/sql/catalyst/src/main/java/org/apache/spark/sql/connector/read/streaming/MicroBatchStream.java @@ -25,6 +25,8 @@ /** * A {@link SparkDataStream} for streaming queries with micro-batch mode. + * + * @since 3.0.0 */ @Evolving public interface MicroBatchStream extends SparkDataStream { diff --git a/sql/catalyst/src/main/java/org/apache/spark/sql/connector/read/streaming/Offset.java b/sql/catalyst/src/main/java/org/apache/spark/sql/connector/read/streaming/Offset.java index 400de2a659746..efb8ebb684f06 100644 --- a/sql/catalyst/src/main/java/org/apache/spark/sql/connector/read/streaming/Offset.java +++ b/sql/catalyst/src/main/java/org/apache/spark/sql/connector/read/streaming/Offset.java @@ -25,6 +25,8 @@ * During execution, offsets provided by the data source implementation will be logged and used as * restart checkpoints. Each source should provide an offset implementation which the source can use * to reconstruct a position in the stream up to which data has been seen/processed. + * + * @since 3.0.0 */ @Evolving public abstract class Offset { diff --git a/sql/catalyst/src/main/java/org/apache/spark/sql/connector/read/streaming/PartitionOffset.java b/sql/catalyst/src/main/java/org/apache/spark/sql/connector/read/streaming/PartitionOffset.java index 35ad3bbde5cbf..faee230467bea 100644 --- a/sql/catalyst/src/main/java/org/apache/spark/sql/connector/read/streaming/PartitionOffset.java +++ b/sql/catalyst/src/main/java/org/apache/spark/sql/connector/read/streaming/PartitionOffset.java @@ -26,6 +26,8 @@ * provide a method to merge these into a global Offset. * * These offsets must be serializable. 
+ * + * @since 3.0.0 */ @Evolving public interface PartitionOffset extends Serializable { diff --git a/sql/catalyst/src/main/java/org/apache/spark/sql/connector/read/streaming/ReadLimit.java b/sql/catalyst/src/main/java/org/apache/spark/sql/connector/read/streaming/ReadLimit.java index 121ed1ad116f9..36f6e05e365d9 100644 --- a/sql/catalyst/src/main/java/org/apache/spark/sql/connector/read/streaming/ReadLimit.java +++ b/sql/catalyst/src/main/java/org/apache/spark/sql/connector/read/streaming/ReadLimit.java @@ -27,6 +27,7 @@ * @see SupportsAdmissionControl#latestOffset(Offset, ReadLimit) * @see ReadAllAvailable * @see ReadMaxRows + * @since 3.0.0 */ @Evolving public interface ReadLimit { diff --git a/sql/catalyst/src/main/java/org/apache/spark/sql/connector/read/streaming/SparkDataStream.java b/sql/catalyst/src/main/java/org/apache/spark/sql/connector/read/streaming/SparkDataStream.java index 1ba0c25ef4466..95703e255ea4e 100644 --- a/sql/catalyst/src/main/java/org/apache/spark/sql/connector/read/streaming/SparkDataStream.java +++ b/sql/catalyst/src/main/java/org/apache/spark/sql/connector/read/streaming/SparkDataStream.java @@ -25,6 +25,8 @@ * * Data sources should implement concrete data stream interfaces: * {@link MicroBatchStream} and {@link ContinuousStream}. + * + * @since 3.0.0 */ @Evolving public interface SparkDataStream { diff --git a/sql/catalyst/src/main/java/org/apache/spark/sql/connector/write/streaming/StreamingDataWriterFactory.java b/sql/catalyst/src/main/java/org/apache/spark/sql/connector/write/streaming/StreamingDataWriterFactory.java index 9946867e8ea65..0923d07e7e5a3 100644 --- a/sql/catalyst/src/main/java/org/apache/spark/sql/connector/write/streaming/StreamingDataWriterFactory.java +++ b/sql/catalyst/src/main/java/org/apache/spark/sql/connector/write/streaming/StreamingDataWriterFactory.java @@ -33,6 +33,8 @@ * Note that, the writer factory will be serialized and sent to executors, then the data writer * will be created on executors and do the actual writing. So this interface must be * serializable and {@link DataWriter} doesn't need to be. + * + * @since 3.0.0 */ @Evolving public interface StreamingDataWriterFactory extends Serializable { diff --git a/sql/catalyst/src/main/java/org/apache/spark/sql/connector/write/streaming/StreamingWrite.java b/sql/catalyst/src/main/java/org/apache/spark/sql/connector/write/streaming/StreamingWrite.java index 4f930e1c158e5..e3dec3b2ff55e 100644 --- a/sql/catalyst/src/main/java/org/apache/spark/sql/connector/write/streaming/StreamingWrite.java +++ b/sql/catalyst/src/main/java/org/apache/spark/sql/connector/write/streaming/StreamingWrite.java @@ -40,6 +40,8 @@ * do it manually in their Spark applications if they want to retry. * * Please refer to the documentation of commit/abort methods for detailed specifications. 
+ * + * @since 3.0.0 */ @Evolving public interface StreamingWrite { diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/util/ArrowUtils.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/util/ArrowUtils.scala index 2da0d1a51cb29..003ce850c926e 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/util/ArrowUtils.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/util/ArrowUtils.scala @@ -27,7 +27,7 @@ import org.apache.arrow.vector.types.pojo.{ArrowType, Field, FieldType, Schema} import org.apache.spark.sql.internal.SQLConf import org.apache.spark.sql.types._ -object ArrowUtils { +private[sql] object ArrowUtils { val rootAllocator = new RootAllocator(Long.MaxValue) From 07a9885f2792be1353f4a923d649e90bc431cb38 Mon Sep 17 00:00:00 2001 From: Bryan Cutler Date: Tue, 11 Feb 2020 10:03:01 +0900 Subject: [PATCH 010/185] [SPARK-30777][PYTHON][TESTS] Fix test failures for Pandas >= 1.0.0 ### What changes were proposed in this pull request? Fix PySpark test failures for using Pandas >= 1.0.0. ### Why are the changes needed? Pandas 1.0.0 has recently been released and has API changes that result in PySpark test failures, this PR fixes the broken tests. ### Does this PR introduce any user-facing change? No ### How was this patch tested? Manually tested with Pandas 1.0.1 and PyArrow 0.16.0 Closes #27529 from BryanCutler/pandas-fix-tests-1.0-SPARK-30777. Authored-by: Bryan Cutler Signed-off-by: HyukjinKwon --- python/pyspark/sql/tests/test_arrow.py | 4 ++-- python/pyspark/sql/tests/test_pandas_grouped_map.py | 6 +++--- python/pyspark/sql/tests/test_pandas_udf_grouped_agg.py | 8 ++++---- 3 files changed, 9 insertions(+), 9 deletions(-) diff --git a/python/pyspark/sql/tests/test_arrow.py b/python/pyspark/sql/tests/test_arrow.py index 98f44dfd29da5..004c79f290213 100644 --- a/python/pyspark/sql/tests/test_arrow.py +++ b/python/pyspark/sql/tests/test_arrow.py @@ -297,9 +297,9 @@ def test_createDataFrame_does_not_modify_input(self): # Some series get converted for Spark to consume, this makes sure input is unchanged pdf = self.create_pandas_data_frame() # Use a nanosecond value to make sure it is not truncated - pdf.ix[0, '8_timestamp_t'] = pd.Timestamp(1) + pdf.iloc[0, 7] = pd.Timestamp(1) # Integers with nulls will get NaNs filled with 0 and will be casted - pdf.ix[1, '2_int_t'] = None + pdf.iloc[1, 1] = None pdf_copy = pdf.copy(deep=True) self.spark.createDataFrame(pdf, schema=self.schema) self.assertTrue(pdf.equals(pdf_copy)) diff --git a/python/pyspark/sql/tests/test_pandas_grouped_map.py b/python/pyspark/sql/tests/test_pandas_grouped_map.py index 51dd07fd7d70c..ff53a0c6f2cf2 100644 --- a/python/pyspark/sql/tests/test_pandas_grouped_map.py +++ b/python/pyspark/sql/tests/test_pandas_grouped_map.py @@ -390,11 +390,11 @@ def rename_pdf(pdf, names): # Function returns a pdf with required column names, but order could be arbitrary using dict def change_col_order(pdf): # Constructing a DataFrame from a dict should result in the same order, - # but use from_items to ensure the pdf column order is different than schema - return pd.DataFrame.from_items([ + # but use OrderedDict to ensure the pdf column order is different than schema + return pd.DataFrame.from_dict(OrderedDict([ ('id', pdf.id), ('u', pdf.v * 2), - ('v', pdf.v)]) + ('v', pdf.v)])) ordered_udf = pandas_udf( change_col_order, diff --git a/python/pyspark/sql/tests/test_pandas_udf_grouped_agg.py b/python/pyspark/sql/tests/test_pandas_udf_grouped_agg.py index 974ad560daebf..21679785a769e 100644 --- 
a/python/pyspark/sql/tests/test_pandas_udf_grouped_agg.py +++ b/python/pyspark/sql/tests/test_pandas_udf_grouped_agg.py @@ -357,7 +357,7 @@ def test_complex_expressions(self): plus_one(sum_udf(col('v1'))), sum_udf(plus_one(col('v2')))) .sort(['id', '(v % 2)']) - .toPandas().sort_index(by=['id', '(v % 2)'])) + .toPandas().sort_values(by=['id', '(v % 2)'])) expected1 = (df.withColumn('v1', df.v + 1) .withColumn('v2', df.v + 2) @@ -368,7 +368,7 @@ def test_complex_expressions(self): plus_one(sum(col('v1'))), sum(plus_one(col('v2')))) .sort(['id', '(v % 2)']) - .toPandas().sort_index(by=['id', '(v % 2)'])) + .toPandas().sort_values(by=['id', '(v % 2)'])) # Test complex expressions with sql expression, scala pandas UDF and # group aggregate pandas UDF @@ -381,7 +381,7 @@ def test_complex_expressions(self): plus_two(sum_udf(col('v1'))), sum_udf(plus_two(col('v2')))) .sort(['id', '(v % 2)']) - .toPandas().sort_index(by=['id', '(v % 2)'])) + .toPandas().sort_values(by=['id', '(v % 2)'])) expected2 = (df.withColumn('v1', df.v + 1) .withColumn('v2', df.v + 2) @@ -392,7 +392,7 @@ def test_complex_expressions(self): plus_two(sum(col('v1'))), sum(plus_two(col('v2')))) .sort(['id', '(v % 2)']) - .toPandas().sort_index(by=['id', '(v % 2)'])) + .toPandas().sort_values(by=['id', '(v % 2)'])) # Test sequential groupby aggregate result3 = (df.groupby('id') From 2bc765a831d7f15c7971d41c36cfbec1fd898dfd Mon Sep 17 00:00:00 2001 From: HyukjinKwon Date: Tue, 11 Feb 2020 15:50:03 +0900 Subject: [PATCH 011/185] [SPARK-30756][SQL] Fix `ThriftServerWithSparkContextSuite` on spark-branch-3.0-test-sbt-hadoop-2.7-hive-2.3 ### What changes were proposed in this pull request? This PR tries #26710 (comment) way to fix the test. ### Why are the changes needed? To make the tests pass. ### Does this PR introduce any user-facing change? No. ### How was this patch tested? Jenkins will test first, and then `on spark-branch-3.0-test-sbt-hadoop-2.7-hive-2.3` will test it out. Closes #27513 from HyukjinKwon/test-SPARK-30756. Authored-by: HyukjinKwon Signed-off-by: HyukjinKwon (cherry picked from commit 8efe367a4ee862b8a85aee8881b0335b34cbba70) Signed-off-by: HyukjinKwon --- project/SparkBuild.scala | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/project/SparkBuild.scala b/project/SparkBuild.scala index 9d0af3aa8c1b6..1c5c36ea8eae2 100644 --- a/project/SparkBuild.scala +++ b/project/SparkBuild.scala @@ -478,7 +478,8 @@ object SparkParallelTestGrouping { "org.apache.spark.sql.hive.thriftserver.ThriftServerQueryTestSuite", "org.apache.spark.sql.hive.thriftserver.SparkSQLEnvSuite", "org.apache.spark.sql.hive.thriftserver.ui.ThriftServerPageSuite", - "org.apache.spark.sql.hive.thriftserver.ui.HiveThriftServer2ListenerSuite" + "org.apache.spark.sql.hive.thriftserver.ui.HiveThriftServer2ListenerSuite", + "org.apache.spark.sql.hive.thriftserver.ThriftServerWithSparkContextSuite" ) private val DEFAULT_TEST_GROUP = "default_test_group" From 0045be766b949dff23ed72bd559568f17f645ffe Mon Sep 17 00:00:00 2001 From: HyukjinKwon Date: Tue, 11 Feb 2020 17:22:08 +0900 Subject: [PATCH 012/185] [SPARK-29462][SQL] The data type of "array()" should be array ### What changes were proposed in this pull request? This brings https://github.com/apache/spark/pull/26324 back. It was reverted basically because, firstly Hive compatibility, and the lack of investigations in other DBMSes and ANSI. - In case of PostgreSQL seems coercing NULL literal to TEXT type. - Presto seems coercing `array() + array(1)` -> array of int. 
- Hive seems `array() + array(1)` -> array of strings Given that, the design choices have been differently made for some reasons. If we pick one of both, seems coercing to array of int makes much more sense. Another investigation was made offline internally. Seems ANSI SQL 2011, section 6.5 "" states: > If ES is specified, then let ET be the element type determined by the context in which ES appears. The declared type DT of ES is Case: > > a) If ES simply contains ARRAY, then ET ARRAY[0]. > > b) If ES simply contains MULTISET, then ET MULTISET. > > ES is effectively replaced by CAST ( ES AS DT ) From reading other related context, doing it to `NullType`. Given the investigation made, choosing to `null` seems correct, and we have a reference Presto now. Therefore, this PR proposes to bring it back. ### Why are the changes needed? When empty array is created, it should be declared as array. ### Does this PR introduce any user-facing change? Yes, `array()` creates `array`. Now `array(1) + array()` can correctly create `array(1)` instead of `array("1")`. ### How was this patch tested? Tested manually Closes #27521 from HyukjinKwon/SPARK-29462. Lead-authored-by: HyukjinKwon Co-authored-by: Aman Omer Signed-off-by: HyukjinKwon --- docs/sql-migration-guide.md | 2 ++ .../expressions/complexTypeCreator.scala | 11 ++++++++++- .../org/apache/spark/sql/internal/SQLConf.scala | 9 +++++++++ .../spark/sql/DataFrameFunctionsSuite.scala | 17 +++++++++++++---- 4 files changed, 34 insertions(+), 5 deletions(-) diff --git a/docs/sql-migration-guide.md b/docs/sql-migration-guide.md index 26eb5838892b4..f98fab5b4c56b 100644 --- a/docs/sql-migration-guide.md +++ b/docs/sql-migration-guide.md @@ -215,6 +215,8 @@ license: | For example `SELECT timestamp 'tomorrow';`. - Since Spark 3.0, the `size` function returns `NULL` for the `NULL` input. In Spark version 2.4 and earlier, this function gives `-1` for the same input. To restore the behavior before Spark 3.0, you can set `spark.sql.legacy.sizeOfNull` to `true`. + + - Since Spark 3.0, when the `array` function is called without any parameters, it returns an empty array of `NullType`. In Spark version 2.4 and earlier, it returns an empty array of string type. To restore the behavior before Spark 3.0, you can set `spark.sql.legacy.arrayDefaultToStringType.enabled` to `true`. - Since Spark 3.0, the interval literal syntax does not allow multiple from-to units anymore. For example, `SELECT INTERVAL '1-1' YEAR TO MONTH '2-2' YEAR TO MONTH'` throws parser exception. 
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/complexTypeCreator.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/complexTypeCreator.scala index 9ce87a4922c01..7335e305bfe55 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/complexTypeCreator.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/complexTypeCreator.scala @@ -23,6 +23,7 @@ import org.apache.spark.sql.catalyst.analysis.FunctionRegistry.FunctionBuilder import org.apache.spark.sql.catalyst.expressions.codegen._ import org.apache.spark.sql.catalyst.expressions.codegen.Block._ import org.apache.spark.sql.catalyst.util._ +import org.apache.spark.sql.internal.SQLConf import org.apache.spark.sql.types._ import org.apache.spark.unsafe.types.UTF8String @@ -44,10 +45,18 @@ case class CreateArray(children: Seq[Expression]) extends Expression { TypeUtils.checkForSameTypeInputExpr(children.map(_.dataType), s"function $prettyName") } + private val defaultElementType: DataType = { + if (SQLConf.get.getConf(SQLConf.LEGACY_ARRAY_DEFAULT_TO_STRING)) { + StringType + } else { + NullType + } + } + override def dataType: ArrayType = { ArrayType( TypeCoercion.findCommonTypeDifferentOnlyInNullFlags(children.map(_.dataType)) - .getOrElse(StringType), + .getOrElse(defaultElementType), containsNull = children.exists(_.nullable)) } diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala index 64c613611c861..d86f8693e0655 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala @@ -2007,6 +2007,15 @@ object SQLConf { .booleanConf .createWithDefault(false) + val LEGACY_ARRAY_DEFAULT_TO_STRING = + buildConf("spark.sql.legacy.arrayDefaultToStringType.enabled") + .internal() + .doc("When set to true, it returns an empty array of string type when the `array` " + + "function is called without any parameters. 
Otherwise, it returns an empty " + + "array of `NullType`") + .booleanConf + .createWithDefault(false) + val TRUNCATE_TABLE_IGNORE_PERMISSION_ACL = buildConf("spark.sql.truncateTable.ignorePermissionAcl.enabled") .internal() diff --git a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameFunctionsSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameFunctionsSuite.scala index 7fce03658fc16..9e9d8c3e9a7c5 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameFunctionsSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameFunctionsSuite.scala @@ -3499,12 +3499,9 @@ class DataFrameFunctionsSuite extends QueryTest with SharedSparkSession { ).foreach(assertValuesDoNotChangeAfterCoalesceOrUnion(_)) } - test("SPARK-21281 use string types by default if array and map have no argument") { + test("SPARK-21281 use string types by default if map have no argument") { val ds = spark.range(1) var expectedSchema = new StructType() - .add("x", ArrayType(StringType, containsNull = false), nullable = false) - assert(ds.select(array().as("x")).schema == expectedSchema) - expectedSchema = new StructType() .add("x", MapType(StringType, StringType, valueContainsNull = false), nullable = false) assert(ds.select(map().as("x")).schema == expectedSchema) } @@ -3577,6 +3574,18 @@ class DataFrameFunctionsSuite extends QueryTest with SharedSparkSession { }.getMessage assert(nonFoldableError.contains("The 'escape' parameter must be a string literal")) } + + test("SPARK-29462: Empty array of NullType for array function with no arguments") { + Seq((true, StringType), (false, NullType)).foreach { + case (arrayDefaultToString, expectedType) => + withSQLConf(SQLConf.LEGACY_ARRAY_DEFAULT_TO_STRING.key -> arrayDefaultToString.toString) { + val schema = spark.range(1).select(array()).schema + assert(schema.nonEmpty && schema.head.dataType.isInstanceOf[ArrayType]) + val actualType = schema.head.dataType.asInstanceOf[ArrayType].elementType + assert(actualType === expectedType) + } + } + } } object DataFrameFunctionsSuite { From b20754d9ee033091e2ef4d5bfa2576f946c9df50 Mon Sep 17 00:00:00 2001 From: root1 Date: Tue, 11 Feb 2020 20:42:02 +0800 Subject: [PATCH 013/185] [SPARK-27545][SQL][DOC] Update the Documentation for CACHE TABLE and UNCACHE TABLE ### What changes were proposed in this pull request? Document updated for `CACHE TABLE` & `UNCACHE TABLE` ### Why are the changes needed? Cache table creates a temp view while caching data using `CACHE TABLE name AS query`. `UNCACHE TABLE` does not remove this temp view. These things were not mentioned in the existing doc for `CACHE TABLE` & `UNCACHE TABLE`. ### Does this PR introduce any user-facing change? Document updated for `CACHE TABLE` & `UNCACHE TABLE` command. ### How was this patch tested? Manually Closes #27090 from iRakson/SPARK-27545. Lead-authored-by: root1 Co-authored-by: iRakson Signed-off-by: Wenchen Fan --- docs/sql-ref-syntax-aux-cache-cache-table.md | 3 ++- docs/sql-ref-syntax-aux-cache-uncache-table.md | 6 +++++- 2 files changed, 7 insertions(+), 2 deletions(-) diff --git a/docs/sql-ref-syntax-aux-cache-cache-table.md b/docs/sql-ref-syntax-aux-cache-cache-table.md index ed6ef973466dd..20ade1961ab0b 100644 --- a/docs/sql-ref-syntax-aux-cache-cache-table.md +++ b/docs/sql-ref-syntax-aux-cache-cache-table.md @@ -20,7 +20,8 @@ license: | --- ### Description -`CACHE TABLE` statement caches contents of a table or output of a query with the given storage level. This reduces scanning of the original files in future queries. 
+`CACHE TABLE` statement caches contents of a table or output of a query with the given storage level. If a query is cached, then a temp view will be created for this query. +This reduces scanning of the original files in future queries. ### Syntax {% highlight sql %} diff --git a/docs/sql-ref-syntax-aux-cache-uncache-table.md b/docs/sql-ref-syntax-aux-cache-uncache-table.md index e0581d0d213df..69e21c258a333 100644 --- a/docs/sql-ref-syntax-aux-cache-uncache-table.md +++ b/docs/sql-ref-syntax-aux-cache-uncache-table.md @@ -21,11 +21,13 @@ license: | ### Description `UNCACHE TABLE` removes the entries and associated data from the in-memory and/or on-disk cache for a given table or view. The -underlying entries should already have been brought to cache by previous `CACHE TABLE` operation. `UNCACHE TABLE` on a non-existent table throws Exception if `IF EXISTS` is not specified. +underlying entries should already have been brought to cache by previous `CACHE TABLE` operation. `UNCACHE TABLE` on a non-existent table throws an exception if `IF EXISTS` is not specified. + ### Syntax {% highlight sql %} UNCACHE TABLE [ IF EXISTS ] table_identifier {% endhighlight %} + ### Parameters

table_identifier
@@ -37,10 +39,12 @@ UNCACHE TABLE [ IF EXISTS ] table_identifier
+ ### Examples {% highlight sql %} UNCACHE TABLE t1; {% endhighlight %} + ### Related Statements * [CACHE TABLE](sql-ref-syntax-aux-cache-cache-table.html) * [CLEAR CACHE](sql-ref-syntax-aux-cache-clear-cache.html) From f1d0dce4848a53831268c80bf7e1e0f47a1f7612 Mon Sep 17 00:00:00 2001 From: fuwhu Date: Tue, 11 Feb 2020 22:16:44 +0800 Subject: [PATCH 014/185] [MINOR][DOC] Add class document for PruneFileSourcePartitions and PruneHiveTablePartitions ### What changes were proposed in this pull request? Add class document for PruneFileSourcePartitions and PruneHiveTablePartitions. ### Why are the changes needed? To describe these two classes. ### Does this PR introduce any user-facing change? no ### How was this patch tested? no Closes #27535 from fuwhu/SPARK-15616-FOLLOW-UP. Authored-by: fuwhu Signed-off-by: Wenchen Fan --- .../datasources/PruneFileSourcePartitions.scala | 13 +++++++++++++ .../hive/execution/PruneHiveTablePartitions.scala | 8 ++++++++ 2 files changed, 21 insertions(+) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/PruneFileSourcePartitions.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/PruneFileSourcePartitions.scala index 1ea19c187e51a..a7129fb14d1a6 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/PruneFileSourcePartitions.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/PruneFileSourcePartitions.scala @@ -26,6 +26,19 @@ import org.apache.spark.sql.catalyst.rules.Rule import org.apache.spark.sql.execution.datasources.v2.{DataSourceV2ScanRelation, FileScan} import org.apache.spark.sql.types.StructType +/** + * Prune the partitions of file source based table using partition filters. Currently, this rule + * is applied to [[HadoopFsRelation]] with [[CatalogFileIndex]] and [[DataSourceV2ScanRelation]] + * with [[FileScan]]. + * + * For [[HadoopFsRelation]], the location will be replaced by pruned file index, and corresponding + * statistics will be updated. And the partition filters will be kept in the filters of returned + * logical plan. + * + * For [[DataSourceV2ScanRelation]], both partition filters and data filters will be added to + * its underlying [[FileScan]]. And the partition filters will be removed in the filters of + * returned logical plan. + */ private[sql] object PruneFileSourcePartitions extends Rule[LogicalPlan] { private def getPartitionKeyFiltersAndDataFilters( diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/PruneHiveTablePartitions.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/PruneHiveTablePartitions.scala index a0349f627d107..da6e4c52cf3a7 100644 --- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/PruneHiveTablePartitions.scala +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/PruneHiveTablePartitions.scala @@ -30,6 +30,14 @@ import org.apache.spark.sql.execution.datasources.DataSourceStrategy import org.apache.spark.sql.internal.SQLConf /** + * Prune hive table partitions using partition filters on [[HiveTableRelation]]. The pruned + * partitions will be kept in [[HiveTableRelation.prunedPartitions]], and the statistics of + * the hive table relation will be updated based on pruned partitions. + * + * This rule is executed in optimization phase, so the statistics can be updated before physical + * planning, which is useful for some spark strategy, eg. + * [[org.apache.spark.sql.execution.SparkStrategies.JoinSelection]]. 
+ * * TODO: merge this with PruneFileSourcePartitions after we completely make hive as a data source. */ private[sql] class PruneHiveTablePartitions(session: SparkSession) From dc66d57e981ac5108e097d4298fa467f0843ffcf Mon Sep 17 00:00:00 2001 From: Maxim Gekk Date: Tue, 11 Feb 2020 09:07:40 -0600 Subject: [PATCH 015/185] [SPARK-30754][SQL] Reuse results of floorDiv in calculations of floorMod in DateTimeUtils ### What changes were proposed in this pull request? In the case of back-to-back calculation of `floorDiv` and `floorMod` with the same arguments, the result of `foorDiv` can be reused in calculation of `floorMod`. The `floorMod` method is defined as the following in Java standard library: ```java public static int floorMod(int x, int y) { int r = x - floorDiv(x, y) * y; return r; } ``` If `floorDiv(x, y)` has been already calculated, it can be reused in `x - floorDiv(x, y) * y`. I propose to modify 2 places in `DateTimeUtils`: 1. `microsToInstant` which is widely used in many date-time functions. `Math.floorMod(us, MICROS_PER_SECOND)` is just replaced by its definition from Java Math library. 2. `truncDate`: `Math.floorMod(oldYear, divider) == 0` is replaced by `Math.floorDiv(oldYear, divider) * divider == oldYear` where `floorDiv(...) * divider` is pre-calculated. ### Why are the changes needed? This reduces the number of arithmetic operations, and can slightly improve performance of date-time functions. ### Does this PR introduce any user-facing change? No ### How was this patch tested? By existing test suites `DateTimeUtilsSuite`, `DateFunctionsSuite` and `DateExpressionsSuite`. Closes #27491 from MaxGekk/opt-microsToInstant. Authored-by: Maxim Gekk Signed-off-by: Sean Owen --- .../spark/sql/catalyst/util/DateTimeUtils.scala | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/DateTimeUtils.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/DateTimeUtils.scala index 8eb560944d4cb..eeae0674166bc 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/DateTimeUtils.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/DateTimeUtils.scala @@ -337,7 +337,9 @@ object DateTimeUtils { def microsToInstant(us: Long): Instant = { val secs = Math.floorDiv(us, MICROS_PER_SECOND) - val mos = Math.floorMod(us, MICROS_PER_SECOND) + // Unfolded Math.floorMod(us, MICROS_PER_SECOND) to reuse the result of + // the above calculation of `secs` via `floorDiv`. + val mos = us - secs * MICROS_PER_SECOND Instant.ofEpochSecond(secs, mos * NANOS_PER_MICROS) } @@ -691,11 +693,11 @@ object DateTimeUtils { def truncDate(d: SQLDate, level: Int): SQLDate = { def truncToYearLevel(divider: Int, adjust: Int): SQLDate = { val oldYear = getYear(d) - var newYear = Math.floorDiv(oldYear, divider) - if (adjust > 0 && Math.floorMod(oldYear, divider) == 0) { - newYear -= 1 + var newYear = Math.floorDiv(oldYear, divider) * divider + if (adjust > 0 && newYear == oldYear) { + newYear -= divider } - newYear = newYear * divider + adjust + newYear += adjust localDateToDays(LocalDate.of(newYear, 1, 1)) } level match { From ea626b6acf0de0ff3b0678372f30ba6f84ae2b09 Mon Sep 17 00:00:00 2001 From: Yin Huai Date: Wed, 12 Feb 2020 00:12:45 +0800 Subject: [PATCH 016/185] [SPARK-30783] Exclude hive-service-rpc ### What changes were proposed in this pull request? Exclude hive-service-rpc from build. ### Why are the changes needed? 
hive-service-rpc 2.3.6 and spark sql's thrift server module have duplicate classes. Leaving hive-service-rpc 2.3.6 in the class path means that spark can pick up classes defined in hive instead of its thrift server module, which can cause hard to debug runtime errors due to class loading order and compilation errors for applications depend on spark. If you compare hive-service-rpc 2.3.6's jar (https://search.maven.org/remotecontent?filepath=org/apache/hive/hive-service-rpc/2.3.6/hive-service-rpc-2.3.6.jar) and spark thrift server's jar (e.g. https://repository.apache.org/content/groups/snapshots/org/apache/spark/spark-hive-thriftserver_2.12/3.0.0-SNAPSHOT/spark-hive-thriftserver_2.12-3.0.0-20200207.021914-364.jar), you will see that all of classes provided by hive-service-rpc-2.3.6.jar are covered by spark thrift server's jar. https://issues.apache.org/jira/browse/SPARK-30783 has output of jar tf for both jars. ### Does this PR introduce any user-facing change? No ### How was this patch tested? Existing tests. Closes #27533 from yhuai/SPARK-30783. Authored-by: Yin Huai Signed-off-by: Wenchen Fan --- dev/deps/spark-deps-hadoop-2.7-hive-2.3 | 1 - dev/deps/spark-deps-hadoop-3.2-hive-2.3 | 1 - pom.xml | 20 ++++++++++++++++++++ 3 files changed, 20 insertions(+), 2 deletions(-) diff --git a/dev/deps/spark-deps-hadoop-2.7-hive-2.3 b/dev/deps/spark-deps-hadoop-2.7-hive-2.3 index 42bdf112efccb..c50cf96dc9065 100644 --- a/dev/deps/spark-deps-hadoop-2.7-hive-2.3 +++ b/dev/deps/spark-deps-hadoop-2.7-hive-2.3 @@ -87,7 +87,6 @@ hive-jdbc/2.3.6//hive-jdbc-2.3.6.jar hive-llap-common/2.3.6//hive-llap-common-2.3.6.jar hive-metastore/2.3.6//hive-metastore-2.3.6.jar hive-serde/2.3.6//hive-serde-2.3.6.jar -hive-service-rpc/2.3.6//hive-service-rpc-2.3.6.jar hive-shims-0.23/2.3.6//hive-shims-0.23-2.3.6.jar hive-shims-common/2.3.6//hive-shims-common-2.3.6.jar hive-shims-scheduler/2.3.6//hive-shims-scheduler-2.3.6.jar diff --git a/dev/deps/spark-deps-hadoop-3.2-hive-2.3 b/dev/deps/spark-deps-hadoop-3.2-hive-2.3 index 6006fa4b43f42..c37ce7fab36f6 100644 --- a/dev/deps/spark-deps-hadoop-3.2-hive-2.3 +++ b/dev/deps/spark-deps-hadoop-3.2-hive-2.3 @@ -86,7 +86,6 @@ hive-jdbc/2.3.6//hive-jdbc-2.3.6.jar hive-llap-common/2.3.6//hive-llap-common-2.3.6.jar hive-metastore/2.3.6//hive-metastore-2.3.6.jar hive-serde/2.3.6//hive-serde-2.3.6.jar -hive-service-rpc/2.3.6//hive-service-rpc-2.3.6.jar hive-shims-0.23/2.3.6//hive-shims-0.23-2.3.6.jar hive-shims-common/2.3.6//hive-shims-common-2.3.6.jar hive-shims-scheduler/2.3.6//hive-shims-scheduler-2.3.6.jar diff --git a/pom.xml b/pom.xml index a8d6ac932bac2..925fa28a291a4 100644 --- a/pom.xml +++ b/pom.xml @@ -1452,6 +1452,11 @@ ${hive.group} hive-service + + + ${hive.group} + hive-service-rpc + ${hive.group} hive-shims @@ -1508,6 +1513,11 @@ ${hive.group} hive-service + + + ${hive.group} + hive-service-rpc + ${hive.group} hive-shims @@ -1761,6 +1771,11 @@ ${hive.group} hive-service + + + ${hive.group} + hive-service-rpc + ${hive.group} hive-shims @@ -1911,6 +1926,11 @@ groovy-all + + + ${hive.group} + hive-service-rpc + org.apache.parquet From 99bd59fe29a87bb70485db536b0ae676e7a9d42e Mon Sep 17 00:00:00 2001 From: HyukjinKwon Date: Tue, 11 Feb 2020 09:55:02 -0800 Subject: [PATCH 017/185] [SPARK-29462][SQL][DOCS] Add some more context and details in 'spark.sql.defaultUrlStreamHandlerFactory.enabled' documentation ### What changes were proposed in this pull request? This PR adds some more information and context to `spark.sql.defaultUrlStreamHandlerFactory.enabled`. 
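As a rough usage sketch (not taken from this patch): because the option is declared with `buildStaticConf`, it is a static SQL conf and has to be supplied before the `SparkSession` is created, for example via the session builder or `--conf` on `spark-submit`. The chosen value and the session-building pattern below are illustrative assumptions only.

```python
from pyspark.sql import SparkSession

# Static SQL conf: set it before the session (and its shared state) starts.
# Disabling it skips registering Hadoop's FsUrlStreamHandlerFactory, e.g. when
# a different stream protocol handler for http/https must be registered instead.
spark = (
    SparkSession.builder
    .config("spark.sql.defaultUrlStreamHandlerFactory.enabled", "false")
    .getOrCreate()
)
```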
### Why are the changes needed? It is a bit difficult to understand the documentation of `spark.sql.defaultUrlStreamHandlerFactory.enabled`. ### Does this PR introduce any user-facing change? Nope, internal doc only fix. ### How was this patch tested? Nope. I only tested linter. Closes #27541 from HyukjinKwon/SPARK-29462-followup. Authored-by: HyukjinKwon Signed-off-by: Dongjoon Hyun --- .../org/apache/spark/sql/internal/StaticSQLConf.scala | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/StaticSQLConf.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/StaticSQLConf.scala index 6bc752260a893..563e51ed597b2 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/StaticSQLConf.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/StaticSQLConf.scala @@ -172,7 +172,13 @@ object StaticSQLConf { val DEFAULT_URL_STREAM_HANDLER_FACTORY_ENABLED = buildStaticConf("spark.sql.defaultUrlStreamHandlerFactory.enabled") - .doc("When true, set FsUrlStreamHandlerFactory to support ADD JAR against HDFS locations") + .doc( + "When true, register Hadoop's FsUrlStreamHandlerFactory to support " + + "ADD JAR against HDFS locations. " + + "It should be disabled when a different stream protocol handler should be registered " + + "to support a particular protocol type, or if Hadoop's FsUrlStreamHandlerFactory " + + "conflicts with other protocol types such as `http` or `https`. See also SPARK-25694 " + + "and HADOOP-14598.") .internal() .booleanConf .createWithDefault(true) From 45db48e2d29359591a4ebc3db4625dd2158e446e Mon Sep 17 00:00:00 2001 From: Maxim Gekk Date: Tue, 11 Feb 2020 10:15:34 -0800 Subject: [PATCH 018/185] Revert "[SPARK-30625][SQL] Support `escape` as third parameter of the `like` function ### What changes were proposed in this pull request? In the PR, I propose to revert the commit 8aebc80e0e67bcb1aa300b8c8b1a209159237632. ### Why are the changes needed? See the concerns https://github.com/apache/spark/pull/27355#issuecomment-584344438 ### Does this PR introduce any user-facing change? No ### How was this patch tested? By existing test suites. Closes #27531 from MaxGekk/revert-like-3-args. 
Authored-by: Maxim Gekk Signed-off-by: Dongjoon Hyun --- .../spark/sql/catalyst/dsl/package.scala | 2 +- .../expressions/regexpExpressions.scala | 85 ++++++------------- .../sql/catalyst/parser/AstBuilder.scala | 4 +- .../spark/sql/DataFrameFunctionsSuite.scala | 15 ---- 4 files changed, 31 insertions(+), 75 deletions(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/dsl/package.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/dsl/package.scala index 40998080bc4e3..b4a8bafe22dfb 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/dsl/package.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/dsl/package.scala @@ -99,7 +99,7 @@ package object dsl { } def like(other: Expression, escapeChar: Char = '\\'): Expression = - Like(expr, other, Literal(escapeChar.toString)) + Like(expr, other, escapeChar) def rlike(other: Expression): Expression = RLike(expr, other) def contains(other: Expression): Expression = Contains(expr, other) def startsWith(other: Expression): Expression = StartsWith(expr, other) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/regexpExpressions.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/regexpExpressions.scala index f8d328bf601e4..e5ee0edfcf79b 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/regexpExpressions.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/regexpExpressions.scala @@ -22,7 +22,6 @@ import java.util.regex.{MatchResult, Pattern} import org.apache.commons.text.StringEscapeUtils -import org.apache.spark.sql.AnalysisException import org.apache.spark.sql.catalyst.expressions.codegen._ import org.apache.spark.sql.catalyst.expressions.codegen.Block._ import org.apache.spark.sql.catalyst.util.{GenericArrayData, StringUtils} @@ -30,19 +29,17 @@ import org.apache.spark.sql.types._ import org.apache.spark.unsafe.types.UTF8String -trait StringRegexExpression extends Expression +abstract class StringRegexExpression extends BinaryExpression with ImplicitCastInputTypes with NullIntolerant { - def str: Expression - def pattern: Expression - def escape(v: String): String def matches(regex: Pattern, str: String): Boolean override def dataType: DataType = BooleanType + override def inputTypes: Seq[DataType] = Seq(StringType, StringType) // try cache foldable pattern - private lazy val cache: Pattern = pattern match { + private lazy val cache: Pattern = right match { case p: Expression if p.foldable => compile(p.eval().asInstanceOf[UTF8String].toString) case _ => null @@ -55,9 +52,10 @@ trait StringRegexExpression extends Expression Pattern.compile(escape(str)) } - def nullSafeMatch(input1: Any, input2: Any): Any = { - val s = input2.asInstanceOf[UTF8String].toString - val regex = if (cache == null) compile(s) else cache + protected def pattern(str: String) = if (cache == null) compile(str) else cache + + protected override def nullSafeEval(input1: Any, input2: Any): Any = { + val regex = pattern(input2.asInstanceOf[UTF8String].toString) if(regex == null) { null } else { @@ -65,7 +63,7 @@ trait StringRegexExpression extends Expression } } - override def sql: String = s"${str.sql} ${prettyName.toUpperCase(Locale.ROOT)} ${pattern.sql}" + override def sql: String = s"${left.sql} ${prettyName.toUpperCase(Locale.ROOT)} ${right.sql}" } // scalastyle:off line.contains.tab @@ -110,65 +108,46 @@ trait StringRegexExpression extends Expression true > SELECT 
'%SystemDrive%/Users/John' _FUNC_ '/%SystemDrive/%//Users%' ESCAPE '/'; true - > SELECT _FUNC_('_Apache Spark_', '__%Spark__', '_'); - true """, note = """ Use RLIKE to match with standard regular expressions. """, since = "1.0.0") // scalastyle:on line.contains.tab -case class Like(str: Expression, pattern: Expression, escape: Expression) - extends TernaryExpression with StringRegexExpression { - - def this(str: Expression, pattern: Expression) = this(str, pattern, Literal("\\")) - - override def inputTypes: Seq[DataType] = Seq(StringType, StringType, StringType) - override def children: Seq[Expression] = Seq(str, pattern, escape) +case class Like(left: Expression, right: Expression, escapeChar: Char) + extends StringRegexExpression { - private lazy val escapeChar: Char = if (escape.foldable) { - escape.eval() match { - case s: UTF8String if s != null && s.numChars() == 1 => s.toString.charAt(0) - case s => throw new AnalysisException( - s"The 'escape' parameter must be a string literal of one char but it is $s.") - } - } else { - throw new AnalysisException("The 'escape' parameter must be a string literal.") - } + def this(left: Expression, right: Expression) = this(left, right, '\\') override def escape(v: String): String = StringUtils.escapeLikeRegex(v, escapeChar) override def matches(regex: Pattern, str: String): Boolean = regex.matcher(str).matches() override def toString: String = escapeChar match { - case '\\' => s"$str LIKE $pattern" - case c => s"$str LIKE $pattern ESCAPE '$c'" - } - - protected override def nullSafeEval(input1: Any, input2: Any, input3: Any): Any = { - nullSafeMatch(input1, input2) + case '\\' => s"$left LIKE $right" + case c => s"$left LIKE $right ESCAPE '$c'" } override protected def doGenCode(ctx: CodegenContext, ev: ExprCode): ExprCode = { val patternClass = classOf[Pattern].getName val escapeFunc = StringUtils.getClass.getName.stripSuffix("$") + ".escapeLikeRegex" - if (pattern.foldable) { - val patternVal = pattern.eval() - if (patternVal != null) { + if (right.foldable) { + val rVal = right.eval() + if (rVal != null) { val regexStr = - StringEscapeUtils.escapeJava(escape(patternVal.asInstanceOf[UTF8String].toString())) - val compiledPattern = ctx.addMutableState(patternClass, "compiledPattern", + StringEscapeUtils.escapeJava(escape(rVal.asInstanceOf[UTF8String].toString())) + val pattern = ctx.addMutableState(patternClass, "patternLike", v => s"""$v = $patternClass.compile("$regexStr");""") // We don't use nullSafeCodeGen here because we don't want to re-evaluate right again. - val eval = str.genCode(ctx) + val eval = left.genCode(ctx) ev.copy(code = code""" ${eval.code} boolean ${ev.isNull} = ${eval.isNull}; ${CodeGenerator.javaType(dataType)} ${ev.value} = ${CodeGenerator.defaultValue(dataType)}; if (!${ev.isNull}) { - ${ev.value} = $compiledPattern.matcher(${eval.value}.toString()).matches(); + ${ev.value} = $pattern.matcher(${eval.value}.toString()).matches(); } """) } else { @@ -178,8 +157,8 @@ case class Like(str: Expression, pattern: Expression, escape: Expression) """) } } else { - val patternStr = ctx.freshName("patternStr") - val compiledPattern = ctx.freshName("compiledPattern") + val pattern = ctx.freshName("pattern") + val rightStr = ctx.freshName("rightStr") // We need double escape to avoid org.codehaus.commons.compiler.CompileException. // '\\' will cause exception 'Single quote must be backslash-escaped in character literal'. // '\"' will cause exception 'Line break in literal not allowed'. 
@@ -188,12 +167,12 @@ case class Like(str: Expression, pattern: Expression, escape: Expression) } else { escapeChar } - nullSafeCodeGen(ctx, ev, (eval1, eval2, _) => { + nullSafeCodeGen(ctx, ev, (eval1, eval2) => { s""" - String $patternStr = $eval2.toString(); - $patternClass $compiledPattern = $patternClass.compile( - $escapeFunc($patternStr, '$newEscapeChar')); - ${ev.value} = $compiledPattern.matcher($eval1.toString()).matches(); + String $rightStr = $eval2.toString(); + $patternClass $pattern = $patternClass.compile( + $escapeFunc($rightStr, '$newEscapeChar')); + ${ev.value} = $pattern.matcher($eval1.toString()).matches(); """ }) } @@ -232,20 +211,12 @@ case class Like(str: Expression, pattern: Expression, escape: Expression) """, since = "1.0.0") // scalastyle:on line.contains.tab -case class RLike(left: Expression, right: Expression) - extends BinaryExpression with StringRegexExpression { - - override def inputTypes: Seq[DataType] = Seq(StringType, StringType) - - override def str: Expression = left - override def pattern: Expression = right +case class RLike(left: Expression, right: Expression) extends StringRegexExpression { override def escape(v: String): String = v override def matches(regex: Pattern, str: String): Boolean = regex.matcher(str).find(0) override def toString: String = s"$left RLIKE $right" - protected override def nullSafeEval(input1: Any, input2: Any): Any = nullSafeMatch(input1, input2) - override protected def doGenCode(ctx: CodegenContext, ev: ExprCode): ExprCode = { val patternClass = classOf[Pattern].getName diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/parser/AstBuilder.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/parser/AstBuilder.scala index 6fc65e14868e0..62e568587fcc6 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/parser/AstBuilder.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/parser/AstBuilder.scala @@ -1392,9 +1392,9 @@ class AstBuilder(conf: SQLConf) extends SqlBaseBaseVisitor[AnyRef] with Logging throw new ParseException("Invalid escape string." 
+ "Escape string must contains only one character.", ctx) } - str + str.charAt(0) }.getOrElse('\\') - invertIfNotDefined(Like(e, expression(ctx.pattern), Literal(escapeChar))) + invertIfNotDefined(Like(e, expression(ctx.pattern), escapeChar)) case SqlBaseParser.RLIKE => invertIfNotDefined(RLike(e, expression(ctx.pattern))) case SqlBaseParser.NULL if ctx.NOT != null => diff --git a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameFunctionsSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameFunctionsSuite.scala index 9e9d8c3e9a7c5..6012678341ccc 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameFunctionsSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameFunctionsSuite.scala @@ -3560,21 +3560,6 @@ class DataFrameFunctionsSuite extends QueryTest with SharedSparkSession { Seq(Row(1))) } - test("the like function with the escape parameter") { - val df = Seq(("abc", "a_c", "!")).toDF("str", "pattern", "escape") - checkAnswer(df.selectExpr("like(str, pattern, '@')"), Row(true)) - - val longEscapeError = intercept[AnalysisException] { - df.selectExpr("like(str, pattern, '@%')").collect() - }.getMessage - assert(longEscapeError.contains("The 'escape' parameter must be a string literal of one char")) - - val nonFoldableError = intercept[AnalysisException] { - df.selectExpr("like(str, pattern, escape)").collect() - }.getMessage - assert(nonFoldableError.contains("The 'escape' parameter must be a string literal")) - } - test("SPARK-29462: Empty array of NullType for array function with no arguments") { Seq((true, StringType), (false, NullType)).foreach { case (arrayDefaultToString, expectedType) => From b25359cca3190f6a34dce3c3e49c4d2a80e88bdc Mon Sep 17 00:00:00 2001 From: herman Date: Wed, 12 Feb 2020 10:48:29 +0900 Subject: [PATCH 019/185] [SPARK-30780][SQL] Empty LocalTableScan should use RDD without partitions ### What changes were proposed in this pull request? This is a small follow-up for https://github.com/apache/spark/pull/27400. This PR makes an empty `LocalTableScanExec` return an `RDD` without partitions. ### Why are the changes needed? It is a bit unexpected that the RDD contains partitions if there is not work to do. It also can save a bit of work when this is used in a more complex plan. ### Does this PR introduce any user-facing change? No ### How was this patch tested? Added test to `SparkPlanSuite`. Closes #27530 from hvanhovell/SPARK-30780. 
Authored-by: herman Signed-off-by: HyukjinKwon --- .../spark/sql/execution/LocalTableScanExec.scala | 12 ++++++++---- .../scala/org/apache/spark/sql/DataFrameSuite.scala | 2 +- .../apache/spark/sql/execution/SparkPlanSuite.scala | 4 ++++ 3 files changed, 13 insertions(+), 5 deletions(-) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/LocalTableScanExec.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/LocalTableScanExec.scala index 1b5115f2e29a3..b452213cd6cc7 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/LocalTableScanExec.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/LocalTableScanExec.scala @@ -45,10 +45,14 @@ case class LocalTableScanExec( } } - private lazy val numParallelism: Int = math.min(math.max(unsafeRows.length, 1), - sqlContext.sparkContext.defaultParallelism) - - private lazy val rdd = sqlContext.sparkContext.parallelize(unsafeRows, numParallelism) + @transient private lazy val rdd: RDD[InternalRow] = { + if (rows.isEmpty) { + sqlContext.sparkContext.emptyRDD + } else { + val numSlices = math.min(unsafeRows.length, sqlContext.sparkContext.defaultParallelism) + sqlContext.sparkContext.parallelize(unsafeRows, numSlices) + } + } protected override def doExecute(): RDD[InternalRow] = { val numOutputRows = longMetric("numOutputRows") diff --git a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameSuite.scala index d2d58a83ded5d..694e576fcded4 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameSuite.scala @@ -330,7 +330,7 @@ class DataFrameSuite extends QueryTest testData.select("key").coalesce(1).select("key"), testData.select("key").collect().toSeq) - assert(spark.emptyDataFrame.coalesce(1).rdd.partitions.size === 1) + assert(spark.emptyDataFrame.coalesce(1).rdd.partitions.size === 0) } test("convert $\"attribute name\" into unresolved attribute") { diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/SparkPlanSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/SparkPlanSuite.scala index e3bc414516c04..56fff1107ae39 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/SparkPlanSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/SparkPlanSuite.scala @@ -84,4 +84,8 @@ class SparkPlanSuite extends QueryTest with SharedSparkSession { } } } + + test("SPARK-30780 empty LocalTableScan should use RDD without partitions") { + assert(LocalTableScanExec(Nil, Nil).execute().getNumPartitions == 0) + } } From aa6a60530e63ab3bb8b117f8738973d1b26a2cc7 Mon Sep 17 00:00:00 2001 From: HyukjinKwon Date: Wed, 12 Feb 2020 10:49:46 +0900 Subject: [PATCH 020/185] [SPARK-30722][PYTHON][DOCS] Update documentation for Pandas UDF with Python type hints ### What changes were proposed in this pull request? This PR targets to document the Pandas UDF redesign with type hints introduced at SPARK-28264. Mostly self-describing; however, there are few things to note for reviewers. 1. This PR replace the existing documentation of pandas UDFs to the newer redesign to promote the Python type hints. I added some words that Spark 3.0 still keeps the compatibility though. 2. This PR proposes to name non-pandas UDFs as "Pandas Function API" 3. SCALAR_ITER become two separate sections to reduce confusion: - `Iterator[pd.Series]` -> `Iterator[pd.Series]` - `Iterator[Tuple[pd.Series, ...]]` -> `Iterator[pd.Series]` 4. 
I removed some examples that look overkill to me. 5. I also removed some information in the doc, that seems duplicating or too much. ### Why are the changes needed? To document new redesign in pandas UDF. ### Does this PR introduce any user-facing change? No. ### How was this patch tested? Existing tests should cover. Closes #27466 from HyukjinKwon/SPARK-30722. Authored-by: HyukjinKwon Signed-off-by: HyukjinKwon --- dev/sparktestsupport/modules.py | 1 - docs/sql-pyspark-pandas-with-arrow.md | 233 +++++++---- examples/src/main/python/sql/arrow.py | 258 ++++++------ python/pyspark/sql/pandas/functions.py | 538 +++++++++++-------------- python/pyspark/sql/pandas/group_ops.py | 99 ++++- python/pyspark/sql/pandas/map_ops.py | 6 +- python/pyspark/sql/udf.py | 16 +- 7 files changed, 609 insertions(+), 542 deletions(-) diff --git a/dev/sparktestsupport/modules.py b/dev/sparktestsupport/modules.py index 40f2ca288d694..391e4bbe1b1f0 100644 --- a/dev/sparktestsupport/modules.py +++ b/dev/sparktestsupport/modules.py @@ -364,7 +364,6 @@ def __hash__(self): "pyspark.sql.avro.functions", "pyspark.sql.pandas.conversion", "pyspark.sql.pandas.map_ops", - "pyspark.sql.pandas.functions", "pyspark.sql.pandas.group_ops", "pyspark.sql.pandas.types", "pyspark.sql.pandas.serializers", diff --git a/docs/sql-pyspark-pandas-with-arrow.md b/docs/sql-pyspark-pandas-with-arrow.md index 7eb8a74547f70..92a515746b607 100644 --- a/docs/sql-pyspark-pandas-with-arrow.md +++ b/docs/sql-pyspark-pandas-with-arrow.md @@ -35,7 +35,7 @@ working with Arrow-enabled data. If you install PySpark using pip, then PyArrow can be brought in as an extra dependency of the SQL module with the command `pip install pyspark[sql]`. Otherwise, you must ensure that PyArrow -is installed and available on all cluster nodes. The current supported version is 0.12.1. +is installed and available on all cluster nodes. The current supported version is 0.15.1+. You can install using pip or conda from the conda-forge channel. See PyArrow [installation](https://arrow.apache.org/docs/python/install.html) for details. @@ -65,132 +65,216 @@ Spark will fall back to create the DataFrame without Arrow. ## Pandas UDFs (a.k.a. Vectorized UDFs) -Pandas UDFs are user defined functions that are executed by Spark using Arrow to transfer data and -Pandas to work with the data. A Pandas UDF is defined using the keyword `pandas_udf` as a decorator -or to wrap the function, no additional configuration is required. Currently, there are two types of -Pandas UDF: Scalar and Grouped Map. +Pandas UDFs are user defined functions that are executed by Spark using +Arrow to transfer data and Pandas to work with the data, which allows vectorized operations. A Pandas +UDF is defined using the `pandas_udf` as a decorator or to wrap the function, and no additional +configuration is required. A Pandas UDF behaves as a regular PySpark function API in general. -### Scalar +Before Spark 3.0, Pandas UDFs used to be defined with `PandasUDFType`. From Spark 3.0 +with Python 3.6+, you can also use [Python type hints](https://www.python.org/dev/peps/pep-0484). +Using Python type hints are preferred and using `PandasUDFType` will be deprecated in +the future release. -Scalar Pandas UDFs are used for vectorizing scalar operations. They can be used with functions such -as `select` and `withColumn`. The Python function should take `pandas.Series` as inputs and return -a `pandas.Series` of the same length. 
Internally, Spark will execute a Pandas UDF by splitting -columns into batches and calling the function for each batch as a subset of the data, then -concatenating the results together. +Note that the type hint should use `pandas.Series` in all cases but there is one variant +that `pandas.DataFrame` should be used for its input or output type hint instead when the input +or output column is of `StructType`. The following example shows a Pandas UDF which takes long +column, string column and struct column, and outputs a struct column. It requires the function to +specify the type hints of `pandas.Series` and `pandas.DataFrame` as below: -The following example shows how to create a scalar Pandas UDF that computes the product of 2 columns. +
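In condensed form, the `ser_to_frame_pandas_udf` example that the include below pulls in (added to `examples/src/main/python/sql/arrow.py` by this patch) defines the UDF as follows; the UDF is then applied to a long column, a string column and a struct column of a DataFrame, and an active `spark` session is assumed as in the other snippets:

{% highlight python %}
import pandas as pd

from pyspark.sql.functions import pandas_udf

# The struct column arrives as a pandas.DataFrame, so its type hint is
# pandas.DataFrame, while the long and string columns keep pandas.Series hints.
@pandas_udf("col1 string, col2 long")
def func(s1: pd.Series, s2: pd.Series, s3: pd.DataFrame) -> pd.DataFrame:
    s3['col2'] = s1 + s2.str.len()
    return s3
{% endhighlight %}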

+

+
+{% include_example ser_to_frame_pandas_udf python/sql/arrow.py %} +
+
+

+ +In the following sections, it describes the cominations of the supported type hints. For simplicity, +`pandas.DataFrame` variant is omitted. + +### Series to Series + +The type hint can be expressed as `pandas.Series`, ... -> `pandas.Series`. + +By using `pandas_udf` with the function having such type hints, it creates a Pandas UDF where the given +function takes one or more `pandas.Series` and outputs one `pandas.Series`. The output of the function should +always be of the same length as the input. Internally, PySpark will execute a Pandas UDF by splitting +columns into batches and calling the function for each batch as a subset of the data, then concatenating +the results together. + +The following example shows how to create this Pandas UDF that computes the product of 2 columns.
-{% include_example scalar_pandas_udf python/sql/arrow.py %} +{% include_example ser_to_ser_pandas_udf python/sql/arrow.py %}
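The `ser_to_ser_pandas_udf` example included above boils down to the following sketch, mirroring the code this patch adds to `examples/src/main/python/sql/arrow.py` (an active `spark` session is assumed):

{% highlight python %}
import pandas as pd

from pyspark.sql.functions import col, pandas_udf
from pyspark.sql.types import LongType

# The function works on pandas.Series and must return a Series of the same length.
def multiply_func(a: pd.Series, b: pd.Series) -> pd.Series:
    return a * b

multiply = pandas_udf(multiply_func, returnType=LongType())

df = spark.createDataFrame(pd.DataFrame([1, 2, 3], columns=["x"]))
df.select(multiply(col("x"), col("x"))).show()
{% endhighlight %}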
-### Scalar Iterator +For detailed usage, please see [`pyspark.sql.functions.pandas_udf`](api/python/pyspark.sql.html#pyspark.sql.functions.pandas_udf) + +### Iterator of Series to Iterator of Series + +The type hint can be expressed as `Iterator[pandas.Series]` -> `Iterator[pandas.Series]`. + +By using `pandas_udf` with the function having such type hints, it creates a Pandas UDF where the given +function takes an iterator of `pandas.Series` and outputs an iterator of `pandas.Series`. The output of each +series from the function should always be of the same length as the input. In this case, the created +Pandas UDF requires one input column when the Pandas UDF is called. To use multiple input columns, +a different type hint is required. See Iterator of Multiple Series to Iterator of Series. + +It is useful when the UDF execution requires initializing some states although internally it works +identically as Series to Series case. The pseudocode below illustrates the example. + +{% highlight python %} +@pandas_udf("long") +def calculate(iterator: Iterator[pd.Series]) -> Iterator[pd.Series]: + # Do some expensive initialization with a state + state = very_expensive_initialization() + for x in iterator: + # Use that state for whole iterator. + yield calculate_with_state(x, state) -Scalar iterator (`SCALAR_ITER`) Pandas UDF is the same as scalar Pandas UDF above except that the -underlying Python function takes an iterator of batches as input instead of a single batch and, -instead of returning a single output batch, it yields output batches or returns an iterator of -output batches. -It is useful when the UDF execution requires initializing some states, e.g., loading an machine -learning model file to apply inference to every input batch. +df.select(calculate("value")).show() +{% endhighlight %} -The following example shows how to create scalar iterator Pandas UDFs: +The following example shows how to create this Pandas UDF:
-{% include_example scalar_iter_pandas_udf python/sql/arrow.py %} +{% include_example iter_ser_to_iter_ser_pandas_udf python/sql/arrow.py %}
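For reference, the `iter_ser_to_iter_ser_pandas_udf` example referenced above (added to `examples/src/main/python/sql/arrow.py` by this patch) is essentially:

{% highlight python %}
from typing import Iterator

import pandas as pd

from pyspark.sql.functions import pandas_udf

df = spark.createDataFrame(pd.DataFrame([1, 2, 3], columns=["x"]))

# Takes an iterator of batches and yields one output Series per input batch,
# each of the same length as its input.
@pandas_udf("long")
def plus_one(iterator: Iterator[pd.Series]) -> Iterator[pd.Series]:
    for x in iterator:
        yield x + 1

df.select(plus_one("x")).show()
{% endhighlight %}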
-### Grouped Map -Grouped map Pandas UDFs are used with `groupBy().apply()` which implements the "split-apply-combine" pattern. -Split-apply-combine consists of three steps: -* Split the data into groups by using `DataFrame.groupBy`. -* Apply a function on each group. The input and output of the function are both `pandas.DataFrame`. The - input data contains all the rows and columns for each group. -* Combine the results into a new `DataFrame`. +For detailed usage, please see [`pyspark.sql.functions.pandas_udf`](api/python/pyspark.sql.html#pyspark.sql.functions.pandas_udf) -To use `groupBy().apply()`, the user needs to define the following: -* A Python function that defines the computation for each group. -* A `StructType` object or a string that defines the schema of the output `DataFrame`. +### Iterator of Multiple Series to Iterator of Series -The column labels of the returned `pandas.DataFrame` must either match the field names in the -defined output schema if specified as strings, or match the field data types by position if not -strings, e.g. integer indices. See [pandas.DataFrame](https://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.html#pandas.DataFrame) -on how to label columns when constructing a `pandas.DataFrame`. +The type hint can be expressed as `Iterator[Tuple[pandas.Series, ...]]` -> `Iterator[pandas.Series]`. -Note that all data for a group will be loaded into memory before the function is applied. This can -lead to out of memory exceptions, especially if the group sizes are skewed. The configuration for -[maxRecordsPerBatch](#setting-arrow-batch-size) is not applied on groups and it is up to the user -to ensure that the grouped data will fit into the available memory. +By using `pandas_udf` with the function having such type hints, it creates a Pandas UDF where the +given function takes an iterator of a tuple of multiple `pandas.Series` and outputs an iterator of `pandas.Series`. +In this case, the created pandas UDF requires multiple input columns as many as the series in the tuple +when the Pandas UDF is called. It works identically as Iterator of Series to Iterator of Series case except the parameter difference. -The following example shows how to use `groupby().apply()` to subtract the mean from each value in the group. +The following example shows how to create this Pandas UDF:
-{% include_example grouped_map_pandas_udf python/sql/arrow.py %} +{% include_example iter_sers_to_iter_ser_pandas_udf python/sql/arrow.py %}
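The included `iter_sers_to_iter_ser_pandas_udf` example (added to `examples/src/main/python/sql/arrow.py` in this patch) is essentially:

{% highlight python %}
from typing import Iterator, Tuple

import pandas as pd

from pyspark.sql.functions import pandas_udf

df = spark.createDataFrame(pd.DataFrame([1, 2, 3], columns=["x"]))

# The UDF is called with as many input columns as there are Series in the tuple.
@pandas_udf("long")
def multiply_two_cols(
        iterator: Iterator[Tuple[pd.Series, pd.Series]]) -> Iterator[pd.Series]:
    for a, b in iterator:
        yield a * b

df.select(multiply_two_cols("x", "x")).show()
{% endhighlight %}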
-For detailed usage, please see [`pyspark.sql.functions.pandas_udf`](api/python/pyspark.sql.html#pyspark.sql.functions.pandas_udf) and -[`pyspark.sql.GroupedData.apply`](api/python/pyspark.sql.html#pyspark.sql.GroupedData.apply). +For detailed usage, please see [`pyspark.sql.functions.pandas_udf`](api/python/pyspark.sql.html#pyspark.sql.functions.pandas_udf) + +### Series to Scalar -### Grouped Aggregate +The type hint can be expressed as `pandas.Series`, ... -> `Any`. -Grouped aggregate Pandas UDFs are similar to Spark aggregate functions. Grouped aggregate Pandas UDFs are used with `groupBy().agg()` and -[`pyspark.sql.Window`](api/python/pyspark.sql.html#pyspark.sql.Window). It defines an aggregation from one or more `pandas.Series` -to a scalar value, where each `pandas.Series` represents a column within the group or window. +By using `pandas_udf` with the function having such type hints, it creates a Pandas UDF similar +to PySpark's aggregate functions. The given function takes `pandas.Series` and returns a scalar value. +The return type should be a primitive data type, and the returned scalar can be either a python +primitive type, e.g., `int` or `float` or a numpy data type, e.g., `numpy.int64` or `numpy.float64`. +`Any` should ideally be a specific scalar type accordingly. -Note that this type of UDF does not support partial aggregation and all data for a group or window will be loaded into memory. Also, -only unbounded window is supported with Grouped aggregate Pandas UDFs currently. +This UDF can be also used with `groupBy().agg()` and [`pyspark.sql.Window`](api/python/pyspark.sql.html#pyspark.sql.Window). +It defines an aggregation from one or more `pandas.Series` to a scalar value, where each `pandas.Series` +represents a column within the group or window. -The following example shows how to use this type of UDF to compute mean with groupBy and window operations: +Note that this type of UDF does not support partial aggregation and all data for a group or window +will be loaded into memory. Also, only unbounded window is supported with Grouped aggregate Pandas +UDFs currently. The following example shows how to use this type of UDF to compute mean with a group-by +and window operations:
-{% include_example grouped_agg_pandas_udf python/sql/arrow.py %} +{% include_example ser_to_scalar_pandas_udf python/sql/arrow.py %}
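The `ser_to_scalar_pandas_udf` example included above (from `examples/src/main/python/sql/arrow.py` in this patch) reduces to the sketch below; the window variant relies on the unbounded window noted above, and an active `spark` session is assumed:

{% highlight python %}
import pandas as pd

from pyspark.sql import Window
from pyspark.sql.functions import pandas_udf

df = spark.createDataFrame(
    [(1, 1.0), (1, 2.0), (2, 3.0), (2, 5.0), (2, 10.0)], ("id", "v"))

# Aggregates a whole pandas.Series of each group (or window) into one scalar.
@pandas_udf("double")
def mean_udf(v: pd.Series) -> float:
    return v.mean()

df.groupby("id").agg(mean_udf(df['v'])).show()

w = Window.partitionBy('id').rowsBetween(
    Window.unboundedPreceding, Window.unboundedFollowing)
df.withColumn('mean_v', mean_udf(df['v']).over(w)).show()
{% endhighlight %}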
For detailed usage, please see [`pyspark.sql.functions.pandas_udf`](api/python/pyspark.sql.html#pyspark.sql.functions.pandas_udf) -### Map Iterator +## Pandas Function APIs + +Pandas function APIs can directly apply a Python native function against the whole DataFrame by +using Pandas instances. Internally it works similarly with Pandas UDFs by Spark using Arrow to transfer +data and Pandas to work with the data, which allows vectorized operations. A Pandas function API behaves +as a regular API under PySpark `DataFrame` in general. + +From Spark 3.0, Grouped map pandas UDF is now categorized as a separate Pandas Function API, +`DataFrame.groupby().applyInPandas()`. It is still possible to use it with `PandasUDFType` +and `DataFrame.groupby().apply()` as it was; however, it is preferred to use +`DataFrame.groupby().applyInPandas()` directly. Using `PandasUDFType` will be deprecated +in the future. + +### Grouped Map + +Grouped map operations with Pandas instances are supported by `DataFrame.groupby().applyInPandas()` +which requires a Python function that takes a `pandas.DataFrame` and return another `pandas.DataFrame`. +It maps each group to each `pandas.DataFrame` in the Python function. + +This API implements the "split-apply-combine" pattern which consists of three steps: +* Split the data into groups by using `DataFrame.groupBy`. +* Apply a function on each group. The input and output of the function are both `pandas.DataFrame`. The + input data contains all the rows and columns for each group. +* Combine the results into a new PySpark `DataFrame`. -Map iterator Pandas UDFs are used to transform data with an iterator of batches. Map iterator -Pandas UDFs can be used with -[`pyspark.sql.DataFrame.mapInPandas`](api/python/pyspark.sql.html#pyspark.sql.DataFrame.mapInPandas). -It defines a map function that transforms an iterator of `pandas.DataFrame` to another. +To use `groupBy().applyInPandas()`, the user needs to define the following: +* A Python function that defines the computation for each group. +* A `StructType` object or a string that defines the schema of the output PySpark `DataFrame`. + +The column labels of the returned `pandas.DataFrame` must either match the field names in the +defined output schema if specified as strings, or match the field data types by position if not +strings, e.g. integer indices. See [pandas.DataFrame](https://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.html#pandas.DataFrame) +on how to label columns when constructing a `pandas.DataFrame`. -It can return the output of arbitrary length in contrast to the scalar Pandas UDF. It maps an iterator of `pandas.DataFrame`s, -that represents the current `DataFrame`, using the map iterator UDF and returns the result as a `DataFrame`. +Note that all data for a group will be loaded into memory before the function is applied. This can +lead to out of memory exceptions, especially if the group sizes are skewed. The configuration for +[maxRecordsPerBatch](#setting-arrow-batch-size) is not applied on groups and it is up to the user +to ensure that the grouped data will fit into the available memory. -The following example shows how to create map iterator Pandas UDFs: +The following example shows how to use `groupby().applyInPandas()` to subtract the mean from each value +in the group.
-{% include_example map_iter_pandas_udf python/sql/arrow.py %} +{% include_example grouped_apply_in_pandas python/sql/arrow.py %}
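For reference, the `grouped_apply_in_pandas` example included above (added to `examples/src/main/python/sql/arrow.py` by this patch) is essentially:

{% highlight python %}
df = spark.createDataFrame(
    [(1, 1.0), (1, 2.0), (2, 3.0), (2, 5.0), (2, 10.0)], ("id", "v"))

def subtract_mean(pdf):
    # pdf is a pandas.DataFrame holding all rows and columns of one group
    v = pdf.v
    return pdf.assign(v=v - v.mean())

df.groupby("id").applyInPandas(subtract_mean, schema="id long, v double").show()
{% endhighlight %}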
-For detailed usage, please see [`pyspark.sql.functions.pandas_udf`](api/python/pyspark.sql.html#pyspark.sql.functions.pandas_udf) and -[`pyspark.sql.DataFrame.mapsInPandas`](api/python/pyspark.sql.html#pyspark.sql.DataFrame.mapInPandas). +For detailed usage, please see [`pyspark.sql.GroupedData.applyInPandas`](api/python/pyspark.sql.html#pyspark.sql.GroupedData.applyInPandas). +### Map + +Map operations with Pandas instances are supported by `DataFrame.mapInPandas()` which maps an iterator +of `pandas.DataFrame`s to another iterator of `pandas.DataFrame`s that represents the current +PySpark `DataFrame` and returns the result as a PySpark `DataFrame`. The functions takes and outputs +an iterator of `pandas.DataFrame`. It can return the output of arbitrary length in contrast to some +Pandas UDFs although internally it works similarly with Series to Series Pandas UDF. + +The following example shows how to use `mapInPandas()`: + +
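In condensed form, the `map_in_pandas` example that follows (added to `examples/src/main/python/sql/arrow.py` by this patch) is:

{% highlight python %}
df = spark.createDataFrame([(1, 21), (2, 30)], ("id", "age"))

def filter_func(iterator):
    # Receives and yields pandas.DataFrames; the output length is arbitrary.
    for pdf in iterator:
        yield pdf[pdf.id == 1]

df.mapInPandas(filter_func, schema=df.schema).show()
{% endhighlight %}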
+
+{% include_example map_in_pandas python/sql/arrow.py %} +
+
-### Cogrouped Map +For detailed usage, please see [`pyspark.sql.DataFrame.mapsInPandas`](api/python/pyspark.sql.html#pyspark.sql.DataFrame.mapInPandas). -Cogrouped map Pandas UDFs allow two DataFrames to be cogrouped by a common key and then a python function applied to -each cogroup. They are used with `groupBy().cogroup().apply()` which consists of the following steps: +### Co-grouped Map +Co-grouped map operations with Pandas instances are supported by `DataFrame.groupby().cogroup().applyInPandas()` which +allows two PySpark `DataFrame`s to be cogrouped by a common key and then a Python function applied to each +cogroup. It consists of the following steps: * Shuffle the data such that the groups of each dataframe which share a key are cogrouped together. -* Apply a function to each cogroup. The input of the function is two `pandas.DataFrame` (with an optional Tuple -representing the key). The output of the function is a `pandas.DataFrame`. -* Combine the pandas.DataFrames from all groups into a new `DataFrame`. +* Apply a function to each cogroup. The input of the function is two `pandas.DataFrame` (with an optional tuple +representing the key). The output of the function is a `pandas.DataFrame`. +* Combine the `pandas.DataFrame`s from all groups into a new PySpark `DataFrame`. -To use `groupBy().cogroup().apply()`, the user needs to define the following: +To use `groupBy().cogroup().applyInPandas()`, the user needs to define the following: * A Python function that defines the computation for each cogroup. -* A `StructType` object or a string that defines the schema of the output `DataFrame`. +* A `StructType` object or a string that defines the schema of the output PySpark `DataFrame`. The column labels of the returned `pandas.DataFrame` must either match the field names in the defined output schema if specified as strings, or match the field data types by position if not @@ -201,16 +285,15 @@ Note that all data for a cogroup will be loaded into memory before the function memory exceptions, especially if the group sizes are skewed. The configuration for [maxRecordsPerBatch](#setting-arrow-batch-size) is not applied and it is up to the user to ensure that the cogrouped data will fit into the available memory. -The following example shows how to use `groupby().cogroup().apply()` to perform an asof join between two datasets. +The following example shows how to use `groupby().cogroup().applyInPandas()` to perform an asof join between two datasets.
-{% include_example cogrouped_map_pandas_udf python/sql/arrow.py %} +{% include_example cogrouped_apply_in_pandas python/sql/arrow.py %}
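The `cogrouped_apply_in_pandas` example included above (from `examples/src/main/python/sql/arrow.py` in this patch) is essentially:

{% highlight python %}
import pandas as pd

df1 = spark.createDataFrame(
    [(20000101, 1, 1.0), (20000101, 2, 2.0), (20000102, 1, 3.0), (20000102, 2, 4.0)],
    ("time", "id", "v1"))

df2 = spark.createDataFrame(
    [(20000101, 1, "x"), (20000101, 2, "y")], ("time", "id", "v2"))

def asof_join(l, r):
    # l and r are the pandas.DataFrames of the two cogrouped sides for one key.
    return pd.merge_asof(l, r, on="time", by="id")

df1.groupby("id").cogroup(df2.groupby("id")).applyInPandas(
    asof_join, schema="time int, id int, v1 double, v2 string").show()
{% endhighlight %}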
-For detailed usage, please see [`pyspark.sql.functions.pandas_udf`](api/python/pyspark.sql.html#pyspark.sql.functions.pandas_udf) and -[`pyspark.sql.CoGroupedData.apply`](api/python/pyspark.sql.html#pyspark.sql.CoGroupedData.apply). +For detailed usage, please see [`pyspark.sql.PandasCogroupedOps.applyInPandas()`](api/python/pyspark.sql.html#pyspark.sql.PandasCogroupedOps.applyInPandas). ## Usage Notes diff --git a/examples/src/main/python/sql/arrow.py b/examples/src/main/python/sql/arrow.py index 1c983172d36ef..b7d8467172fab 100644 --- a/examples/src/main/python/sql/arrow.py +++ b/examples/src/main/python/sql/arrow.py @@ -23,12 +23,19 @@ from __future__ import print_function +import sys + from pyspark.sql import SparkSession from pyspark.sql.pandas.utils import require_minimum_pandas_version, require_minimum_pyarrow_version require_minimum_pandas_version() require_minimum_pyarrow_version() +if sys.version_info < (3, 6): + raise Exception( + "Running this example file requires Python 3.6+; however, " + "your Python version was:\n %s" % sys.version) + def dataframe_with_arrow_example(spark): # $example on:dataframe_with_arrow$ @@ -50,15 +57,45 @@ def dataframe_with_arrow_example(spark): print("Pandas DataFrame result statistics:\n%s\n" % str(result_pdf.describe())) -def scalar_pandas_udf_example(spark): - # $example on:scalar_pandas_udf$ +def ser_to_frame_pandas_udf_example(spark): + # $example on:ser_to_frame_pandas_udf$ + import pandas as pd + + from pyspark.sql.functions import pandas_udf + + @pandas_udf("col1 string, col2 long") + def func(s1: pd.Series, s2: pd.Series, s3: pd.DataFrame) -> pd.DataFrame: + s3['col2'] = s1 + s2.str.len() + return s3 + + # Create a Spark DataFrame that has three columns including a sturct column. + df = spark.createDataFrame( + [[1, "a string", ("a nested string",)]], + "long_col long, string_col string, struct_col struct") + + df.printSchema() + # root + # |-- long_column: long (nullable = true) + # |-- string_column: string (nullable = true) + # |-- struct_column: struct (nullable = true) + # | |-- col1: string (nullable = true) + + df.select(func("long_col", "string_col", "struct_col")).printSchema() + # |-- func(long_col, string_col, struct_col): struct (nullable = true) + # | |-- col1: string (nullable = true) + # | |-- col2: long (nullable = true) + # $example off:ser_to_frame_pandas_udf$$ + + +def ser_to_ser_pandas_udf_example(spark): + # $example on:ser_to_ser_pandas_udf$ import pandas as pd from pyspark.sql.functions import col, pandas_udf from pyspark.sql.types import LongType # Declare the function and create the UDF - def multiply_func(a, b): + def multiply_func(a: pd.Series, b: pd.Series) -> pd.Series: return a * b multiply = pandas_udf(multiply_func, returnType=LongType()) @@ -83,26 +120,27 @@ def multiply_func(a, b): # | 4| # | 9| # +-------------------+ - # $example off:scalar_pandas_udf$ + # $example off:ser_to_ser_pandas_udf$ -def scalar_iter_pandas_udf_example(spark): - # $example on:scalar_iter_pandas_udf$ +def iter_ser_to_iter_ser_pandas_udf_example(spark): + # $example on:iter_ser_to_iter_ser_pandas_udf$ + from typing import Iterator + import pandas as pd - from pyspark.sql.functions import col, pandas_udf, struct, PandasUDFType + from pyspark.sql.functions import pandas_udf pdf = pd.DataFrame([1, 2, 3], columns=["x"]) df = spark.createDataFrame(pdf) - # When the UDF is called with a single column that is not StructType, - # the input to the underlying function is an iterator of pd.Series. 
- @pandas_udf("long", PandasUDFType.SCALAR_ITER) - def plus_one(batch_iter): - for x in batch_iter: + # Declare the function and create the UDF + @pandas_udf("long") + def plus_one(iterator: Iterator[pd.Series]) -> Iterator[pd.Series]: + for x in iterator: yield x + 1 - df.select(plus_one(col("x"))).show() + df.select(plus_one("x")).show() # +-----------+ # |plus_one(x)| # +-----------+ @@ -110,15 +148,28 @@ def plus_one(batch_iter): # | 3| # | 4| # +-----------+ + # $example off:iter_ser_to_iter_ser_pandas_udf$ + + +def iter_sers_to_iter_ser_pandas_udf_example(spark): + # $example on:iter_sers_to_iter_ser_pandas_udf$ + from typing import Iterator, Tuple + + import pandas as pd - # When the UDF is called with more than one columns, - # the input to the underlying function is an iterator of pd.Series tuple. - @pandas_udf("long", PandasUDFType.SCALAR_ITER) - def multiply_two_cols(batch_iter): - for a, b in batch_iter: + from pyspark.sql.functions import pandas_udf + + pdf = pd.DataFrame([1, 2, 3], columns=["x"]) + df = spark.createDataFrame(pdf) + + # Declare the function and create the UDF + @pandas_udf("long") + def multiply_two_cols( + iterator: Iterator[Tuple[pd.Series, pd.Series]]) -> Iterator[pd.Series]: + for a, b in iterator: yield a * b - df.select(multiply_two_cols(col("x"), col("x"))).show() + df.select(multiply_two_cols("x", "x")).show() # +-----------------------+ # |multiply_two_cols(x, x)| # +-----------------------+ @@ -126,92 +177,32 @@ def multiply_two_cols(batch_iter): # | 4| # | 9| # +-----------------------+ + # $example off:iter_sers_to_iter_ser_pandas_udf$ - # When the UDF is called with a single column that is StructType, - # the input to the underlying function is an iterator of pd.DataFrame. - @pandas_udf("long", PandasUDFType.SCALAR_ITER) - def multiply_two_nested_cols(pdf_iter): - for pdf in pdf_iter: - yield pdf["a"] * pdf["b"] - - df.select( - multiply_two_nested_cols( - struct(col("x").alias("a"), col("x").alias("b")) - ).alias("y") - ).show() - # +---+ - # | y| - # +---+ - # | 1| - # | 4| - # | 9| - # +---+ - - # In the UDF, you can initialize some states before processing batches. - # Wrap your code with try/finally or use context managers to ensure - # the release of resources at the end. 
- y_bc = spark.sparkContext.broadcast(1) - - @pandas_udf("long", PandasUDFType.SCALAR_ITER) - def plus_y(batch_iter): - y = y_bc.value # initialize states - try: - for x in batch_iter: - yield x + y - finally: - pass # release resources here, if any - - df.select(plus_y(col("x"))).show() - # +---------+ - # |plus_y(x)| - # +---------+ - # | 2| - # | 3| - # | 4| - # +---------+ - # $example off:scalar_iter_pandas_udf$ - - -def grouped_map_pandas_udf_example(spark): - # $example on:grouped_map_pandas_udf$ - from pyspark.sql.functions import pandas_udf, PandasUDFType - - df = spark.createDataFrame( - [(1, 1.0), (1, 2.0), (2, 3.0), (2, 5.0), (2, 10.0)], - ("id", "v")) - - @pandas_udf("id long, v double", PandasUDFType.GROUPED_MAP) - def subtract_mean(pdf): - # pdf is a pandas.DataFrame - v = pdf.v - return pdf.assign(v=v - v.mean()) - - df.groupby("id").apply(subtract_mean).show() - # +---+----+ - # | id| v| - # +---+----+ - # | 1|-0.5| - # | 1| 0.5| - # | 2|-3.0| - # | 2|-1.0| - # | 2| 4.0| - # +---+----+ - # $example off:grouped_map_pandas_udf$ +def ser_to_scalar_pandas_udf_example(spark): + # $example on:ser_to_scalar_pandas_udf$ + import pandas as pd -def grouped_agg_pandas_udf_example(spark): - # $example on:grouped_agg_pandas_udf$ - from pyspark.sql.functions import pandas_udf, PandasUDFType + from pyspark.sql.functions import pandas_udf from pyspark.sql import Window df = spark.createDataFrame( [(1, 1.0), (1, 2.0), (2, 3.0), (2, 5.0), (2, 10.0)], ("id", "v")) - @pandas_udf("double", PandasUDFType.GROUPED_AGG) - def mean_udf(v): + # Declare the function and create the UDF + @pandas_udf("double") + def mean_udf(v: pd.Series) -> float: return v.mean() + df.select(mean_udf(df['v'])).show() + # +-----------+ + # |mean_udf(v)| + # +-----------+ + # | 4.2| + # +-----------+ + df.groupby("id").agg(mean_udf(df['v'])).show() # +---+-----------+ # | id|mean_udf(v)| @@ -233,37 +224,54 @@ def mean_udf(v): # | 2| 5.0| 6.0| # | 2|10.0| 6.0| # +---+----+------+ - # $example off:grouped_agg_pandas_udf$ + # $example off:ser_to_scalar_pandas_udf$ -def map_iter_pandas_udf_example(spark): - # $example on:map_iter_pandas_udf$ - import pandas as pd +def grouped_apply_in_pandas_example(spark): + # $example on:grouped_apply_in_pandas$ + df = spark.createDataFrame( + [(1, 1.0), (1, 2.0), (2, 3.0), (2, 5.0), (2, 10.0)], + ("id", "v")) - from pyspark.sql.functions import pandas_udf, PandasUDFType + def subtract_mean(pdf): + # pdf is a pandas.DataFrame + v = pdf.v + return pdf.assign(v=v - v.mean()) + + df.groupby("id").applyInPandas(subtract_mean, schema="id long, v double").show() + # +---+----+ + # | id| v| + # +---+----+ + # | 1|-0.5| + # | 1| 0.5| + # | 2|-3.0| + # | 2|-1.0| + # | 2| 4.0| + # +---+----+ + # $example off:grouped_apply_in_pandas$ + +def map_in_pandas_example(spark): + # $example on:map_in_pandas$ df = spark.createDataFrame([(1, 21), (2, 30)], ("id", "age")) - @pandas_udf(df.schema, PandasUDFType.MAP_ITER) - def filter_func(batch_iter): - for pdf in batch_iter: + def filter_func(iterator): + for pdf in iterator: yield pdf[pdf.id == 1] - df.mapInPandas(filter_func).show() + df.mapInPandas(filter_func, schema=df.schema).show() # +---+---+ # | id|age| # +---+---+ # | 1| 21| # +---+---+ - # $example off:map_iter_pandas_udf$ + # $example off:map_in_pandas$ -def cogrouped_map_pandas_udf_example(spark): - # $example on:cogrouped_map_pandas_udf$ +def cogrouped_apply_in_pandas_example(spark): + # $example on:cogrouped_apply_in_pandas$ import pandas as pd - from pyspark.sql.functions import pandas_udf, 
PandasUDFType - df1 = spark.createDataFrame( [(20000101, 1, 1.0), (20000101, 2, 2.0), (20000102, 1, 3.0), (20000102, 2, 4.0)], ("time", "id", "v1")) @@ -272,11 +280,11 @@ def cogrouped_map_pandas_udf_example(spark): [(20000101, 1, "x"), (20000101, 2, "y")], ("time", "id", "v2")) - @pandas_udf("time int, id int, v1 double, v2 string", PandasUDFType.COGROUPED_MAP) def asof_join(l, r): return pd.merge_asof(l, r, on="time", by="id") - df1.groupby("id").cogroup(df2.groupby("id")).apply(asof_join).show() + df1.groupby("id").cogroup(df2.groupby("id")).applyInPandas( + asof_join, schema="time int, id int, v1 double, v2 string").show() # +--------+---+---+---+ # | time| id| v1| v2| # +--------+---+---+---+ @@ -285,7 +293,7 @@ def asof_join(l, r): # |20000101| 2|2.0| y| # |20000102| 2|4.0| y| # +--------+---+---+---+ - # $example off:cogrouped_map_pandas_udf$ + # $example off:cogrouped_apply_in_pandas$ if __name__ == "__main__": @@ -296,17 +304,21 @@ def asof_join(l, r): print("Running Pandas to/from conversion example") dataframe_with_arrow_example(spark) - print("Running pandas_udf scalar example") - scalar_pandas_udf_example(spark) - print("Running pandas_udf scalar iterator example") - scalar_iter_pandas_udf_example(spark) - print("Running pandas_udf grouped map example") - grouped_map_pandas_udf_example(spark) - print("Running pandas_udf grouped agg example") - grouped_agg_pandas_udf_example(spark) - print("Running pandas_udf map iterator example") - map_iter_pandas_udf_example(spark) - print("Running pandas_udf cogrouped map example") - cogrouped_map_pandas_udf_example(spark) + print("Running pandas_udf example: Series to Frame") + ser_to_frame_pandas_udf_example(spark) + print("Running pandas_udf example: Series to Series") + ser_to_ser_pandas_udf_example(spark) + print("Running pandas_udf example: Iterator of Series to Iterator of Seires") + iter_ser_to_iter_ser_pandas_udf_example(spark) + print("Running pandas_udf example: Iterator of Multiple Series to Iterator of Series") + iter_sers_to_iter_ser_pandas_udf_example(spark) + print("Running pandas_udf example: Series to Scalar") + ser_to_scalar_pandas_udf_example(spark) + print("Running pandas function example: Grouped Map") + grouped_apply_in_pandas_example(spark) + print("Running pandas function example: Map") + map_in_pandas_example(spark) + print("Running pandas function example: Co-grouped Map") + cogrouped_apply_in_pandas_example(spark) spark.stop() diff --git a/python/pyspark/sql/pandas/functions.py b/python/pyspark/sql/pandas/functions.py index 30602789a33a9..31aa321bf5826 100644 --- a/python/pyspark/sql/pandas/functions.py +++ b/python/pyspark/sql/pandas/functions.py @@ -43,303 +43,228 @@ class PandasUDFType(object): @since(2.3) def pandas_udf(f=None, returnType=None, functionType=None): """ - Creates a vectorized user defined function (UDF). + Creates a pandas user defined function (a.k.a. vectorized user defined function). + + Pandas UDFs are user defined functions that are executed by Spark using Arrow to transfer + data and Pandas to work with the data, which allows vectorized operations. A Pandas UDF + is defined using the `pandas_udf` as a decorator or to wrap the function, and no + additional configuration is required. A Pandas UDF behaves as a regular PySpark function + API in general. :param f: user-defined function. A python function if used as a standalone function :param returnType: the return type of the user-defined function. 
The value can be either a :class:`pyspark.sql.types.DataType` object or a DDL-formatted type string. :param functionType: an enum value in :class:`pyspark.sql.functions.PandasUDFType`. - Default: SCALAR. - - .. seealso:: :meth:`pyspark.sql.DataFrame.mapInPandas` - .. seealso:: :meth:`pyspark.sql.GroupedData.applyInPandas` - .. seealso:: :meth:`pyspark.sql.PandasCogroupedOps.applyInPandas` - - The function type of the UDF can be one of the following: - - 1. SCALAR - - A scalar UDF defines a transformation: One or more `pandas.Series` -> A `pandas.Series`. - The length of the returned `pandas.Series` must be of the same as the input `pandas.Series`. - If the return type is :class:`StructType`, the returned value should be a `pandas.DataFrame`. - - :class:`MapType`, nested :class:`StructType` are currently not supported as output types. - - Scalar UDFs can be used with :meth:`pyspark.sql.DataFrame.withColumn` and - :meth:`pyspark.sql.DataFrame.select`. - - >>> from pyspark.sql.functions import pandas_udf, PandasUDFType - >>> from pyspark.sql.types import IntegerType, StringType - >>> slen = pandas_udf(lambda s: s.str.len(), IntegerType()) # doctest: +SKIP - >>> @pandas_udf(StringType()) # doctest: +SKIP - ... def to_upper(s): - ... return s.str.upper() - ... - >>> @pandas_udf("integer", PandasUDFType.SCALAR) # doctest: +SKIP - ... def add_one(x): - ... return x + 1 - ... - >>> df = spark.createDataFrame([(1, "John Doe", 21)], - ... ("id", "name", "age")) # doctest: +SKIP - >>> df.select(slen("name").alias("slen(name)"), to_upper("name"), add_one("age")) \\ - ... .show() # doctest: +SKIP - +----------+--------------+------------+ - |slen(name)|to_upper(name)|add_one(age)| - +----------+--------------+------------+ - | 8| JOHN DOE| 22| - +----------+--------------+------------+ - >>> @pandas_udf("first string, last string") # doctest: +SKIP - ... def split_expand(n): - ... return n.str.split(expand=True) - >>> df.select(split_expand("name")).show() # doctest: +SKIP - +------------------+ - |split_expand(name)| - +------------------+ - | [John, Doe]| - +------------------+ - - .. note:: The length of `pandas.Series` within a scalar UDF is not that of the whole input - column, but is the length of an internal batch used for each call to the function. - Therefore, this can be used, for example, to ensure the length of each returned - `pandas.Series`, and can not be used as the column length. - - 2. SCALAR_ITER - - A scalar iterator UDF is semantically the same as the scalar Pandas UDF above except that the - wrapped Python function takes an iterator of batches as input instead of a single batch and, - instead of returning a single output batch, it yields output batches or explicitly returns an - generator or an iterator of output batches. - It is useful when the UDF execution requires initializing some state, e.g., loading a machine - learning model file to apply inference to every input batch. - - .. note:: It is not guaranteed that one invocation of a scalar iterator UDF will process all - batches from one partition, although it is currently implemented this way. - Your code shall not rely on this behavior because it might change in the future for - further optimization, e.g., one invocation processes multiple partitions. - - Scalar iterator UDFs are used with :meth:`pyspark.sql.DataFrame.withColumn` and - :meth:`pyspark.sql.DataFrame.select`. 
- - >>> import pandas as pd # doctest: +SKIP - >>> from pyspark.sql.functions import col, pandas_udf, struct, PandasUDFType - >>> pdf = pd.DataFrame([1, 2, 3], columns=["x"]) # doctest: +SKIP - >>> df = spark.createDataFrame(pdf) # doctest: +SKIP - - When the UDF is called with a single column that is not `StructType`, the input to the - underlying function is an iterator of `pd.Series`. - - >>> @pandas_udf("long", PandasUDFType.SCALAR_ITER) # doctest: +SKIP - ... def plus_one(batch_iter): - ... for x in batch_iter: - ... yield x + 1 - ... - >>> df.select(plus_one(col("x"))).show() # doctest: +SKIP - +-----------+ - |plus_one(x)| - +-----------+ - | 2| - | 3| - | 4| - +-----------+ - - When the UDF is called with more than one columns, the input to the underlying function is an - iterator of `pd.Series` tuple. - - >>> @pandas_udf("long", PandasUDFType.SCALAR_ITER) # doctest: +SKIP - ... def multiply_two_cols(batch_iter): - ... for a, b in batch_iter: - ... yield a * b - ... - >>> df.select(multiply_two_cols(col("x"), col("x"))).show() # doctest: +SKIP - +-----------------------+ - |multiply_two_cols(x, x)| - +-----------------------+ - | 1| - | 4| - | 9| - +-----------------------+ - - When the UDF is called with a single column that is `StructType`, the input to the underlying - function is an iterator of `pd.DataFrame`. - - >>> @pandas_udf("long", PandasUDFType.SCALAR_ITER) # doctest: +SKIP - ... def multiply_two_nested_cols(pdf_iter): - ... for pdf in pdf_iter: - ... yield pdf["a"] * pdf["b"] - ... - >>> df.select( - ... multiply_two_nested_cols( - ... struct(col("x").alias("a"), col("x").alias("b")) - ... ).alias("y") - ... ).show() # doctest: +SKIP - +---+ - | y| - +---+ - | 1| - | 4| - | 9| - +---+ - - In the UDF, you can initialize some states before processing batches, wrap your code with - `try ... finally ...` or use context managers to ensure the release of resources at the end - or in case of early termination. - - >>> y_bc = spark.sparkContext.broadcast(1) # doctest: +SKIP - >>> @pandas_udf("long", PandasUDFType.SCALAR_ITER) # doctest: +SKIP - ... def plus_y(batch_iter): - ... y = y_bc.value # initialize some state - ... try: - ... for x in batch_iter: - ... yield x + y - ... finally: - ... pass # release resources here, if any - ... - >>> df.select(plus_y(col("x"))).show() # doctest: +SKIP - +---------+ - |plus_y(x)| - +---------+ - | 2| - | 3| - | 4| - +---------+ - - 3. GROUPED_MAP - - A grouped map UDF defines transformation: A `pandas.DataFrame` -> A `pandas.DataFrame` - The returnType should be a :class:`StructType` describing the schema of the returned - `pandas.DataFrame`. The column labels of the returned `pandas.DataFrame` must either match - the field names in the defined returnType schema if specified as strings, or match the - field data types by position if not strings, e.g. integer indices. - The length of the returned `pandas.DataFrame` can be arbitrary. - - Grouped map UDFs are used with :meth:`pyspark.sql.GroupedData.apply`. - - >>> from pyspark.sql.functions import pandas_udf, PandasUDFType - >>> df = spark.createDataFrame( - ... [(1, 1.0), (1, 2.0), (2, 3.0), (2, 5.0), (2, 10.0)], - ... ("id", "v")) # doctest: +SKIP - >>> @pandas_udf("id long, v double", PandasUDFType.GROUPED_MAP) # doctest: +SKIP - ... def normalize(pdf): - ... v = pdf.v - ... 
return pdf.assign(v=(v - v.mean()) / v.std()) - >>> df.groupby("id").apply(normalize).show() # doctest: +SKIP - +---+-------------------+ - | id| v| - +---+-------------------+ - | 1|-0.7071067811865475| - | 1| 0.7071067811865475| - | 2|-0.8320502943378437| - | 2|-0.2773500981126146| - | 2| 1.1094003924504583| - +---+-------------------+ - - Alternatively, the user can define a function that takes two arguments. - In this case, the grouping key(s) will be passed as the first argument and the data will - be passed as the second argument. The grouping key(s) will be passed as a tuple of numpy - data types, e.g., `numpy.int32` and `numpy.float64`. The data will still be passed in - as a `pandas.DataFrame` containing all columns from the original Spark DataFrame. - This is useful when the user does not want to hardcode grouping key(s) in the function. - - >>> import pandas as pd # doctest: +SKIP - >>> from pyspark.sql.functions import pandas_udf, PandasUDFType - >>> df = spark.createDataFrame( - ... [(1, 1.0), (1, 2.0), (2, 3.0), (2, 5.0), (2, 10.0)], - ... ("id", "v")) # doctest: +SKIP - >>> @pandas_udf("id long, v double", PandasUDFType.GROUPED_MAP) # doctest: +SKIP - ... def mean_udf(key, pdf): - ... # key is a tuple of one numpy.int64, which is the value - ... # of 'id' for the current group - ... return pd.DataFrame([key + (pdf.v.mean(),)]) - >>> df.groupby('id').apply(mean_udf).show() # doctest: +SKIP - +---+---+ - | id| v| - +---+---+ - | 1|1.5| - | 2|6.0| - +---+---+ - >>> @pandas_udf( - ... "id long, `ceil(v / 2)` long, v double", - ... PandasUDFType.GROUPED_MAP) # doctest: +SKIP - >>> def sum_udf(key, pdf): - ... # key is a tuple of two numpy.int64s, which is the values - ... # of 'id' and 'ceil(df.v / 2)' for the current group - ... return pd.DataFrame([key + (pdf.v.sum(),)]) - >>> df.groupby(df.id, ceil(df.v / 2)).apply(sum_udf).show() # doctest: +SKIP - +---+-----------+----+ - | id|ceil(v / 2)| v| - +---+-----------+----+ - | 2| 5|10.0| - | 1| 1| 3.0| - | 2| 3| 5.0| - | 2| 2| 3.0| - +---+-----------+----+ - - .. note:: If returning a new `pandas.DataFrame` constructed with a dictionary, it is - recommended to explicitly index the columns by name to ensure the positions are correct, - or alternatively use an `OrderedDict`. - For example, `pd.DataFrame({'id': ids, 'a': data}, columns=['id', 'a'])` or - `pd.DataFrame(OrderedDict([('id', ids), ('a', data)]))`. - - .. seealso:: :meth:`pyspark.sql.GroupedData.apply` - - 4. GROUPED_AGG - - A grouped aggregate UDF defines a transformation: One or more `pandas.Series` -> A scalar - The `returnType` should be a primitive data type, e.g., :class:`DoubleType`. - The returned scalar can be either a python primitive type, e.g., `int` or `float` - or a numpy data type, e.g., `numpy.int64` or `numpy.float64`. - - :class:`MapType` and :class:`StructType` are currently not supported as output types. - - Group aggregate UDFs are used with :meth:`pyspark.sql.GroupedData.agg` and - :class:`pyspark.sql.Window` - - This example shows using grouped aggregated UDFs with groupby: - - >>> from pyspark.sql.functions import pandas_udf, PandasUDFType - >>> df = spark.createDataFrame( - ... [(1, 1.0), (1, 2.0), (2, 3.0), (2, 5.0), (2, 10.0)], - ... ("id", "v")) - >>> @pandas_udf("double", PandasUDFType.GROUPED_AGG) # doctest: +SKIP - ... def mean_udf(v): - ... 
return v.mean() - >>> df.groupby("id").agg(mean_udf(df['v'])).show() # doctest: +SKIP - +---+-----------+ - | id|mean_udf(v)| - +---+-----------+ - | 1| 1.5| - | 2| 6.0| - +---+-----------+ - - This example shows using grouped aggregated UDFs as window functions. - - >>> from pyspark.sql.functions import pandas_udf, PandasUDFType - >>> from pyspark.sql import Window - >>> df = spark.createDataFrame( - ... [(1, 1.0), (1, 2.0), (2, 3.0), (2, 5.0), (2, 10.0)], - ... ("id", "v")) - >>> @pandas_udf("double", PandasUDFType.GROUPED_AGG) # doctest: +SKIP - ... def mean_udf(v): - ... return v.mean() - >>> w = (Window.partitionBy('id') - ... .orderBy('v') - ... .rowsBetween(-1, 0)) - >>> df.withColumn('mean_v', mean_udf(df['v']).over(w)).show() # doctest: +SKIP - +---+----+------+ - | id| v|mean_v| - +---+----+------+ - | 1| 1.0| 1.0| - | 1| 2.0| 1.5| - | 2| 3.0| 3.0| - | 2| 5.0| 4.0| - | 2|10.0| 7.5| - +---+----+------+ - - .. note:: For performance reasons, the input series to window functions are not copied. + Default: SCALAR. + + .. note:: This parameter exists for compatibility. Using Python type hints is encouraged. + + In order to use this API, customarily the below are imported: + + >>> import pandas as pd + >>> from pyspark.sql.functions import pandas_udf + + From Spark 3.0 with Python 3.6+, `Python type hints `_ + detect the function types as below: + + >>> @pandas_udf(IntegerType()) + ... def slen(s: pd.Series) -> pd.Series: + ... return s.str.len() + + Prior to Spark 3.0, the pandas UDF used `functionType` to decide the execution type as below: + + >>> from pyspark.sql.functions import PandasUDFType + >>> from pyspark.sql.types import IntegerType + >>> @pandas_udf(IntegerType(), PandasUDFType.SCALAR) + ... def slen(s): + ... return s.str.len() + + It is preferred to specify type hints for the pandas UDF instead of specifying pandas UDF + type via `functionType` which will be deprecated in the future releases. + + Note that the type hint should use `pandas.Series` in all cases but there is one variant + that `pandas.DataFrame` should be used for its input or output type hint instead when the input + or output column is of :class:`pyspark.sql.types.StructType`. The following example shows + a Pandas UDF which takes long column, string column and struct column, and outputs a struct + column. It requires the function to specify the type hints of `pandas.Series` and + `pandas.DataFrame` as below: + + >>> @pandas_udf("col1 string, col2 long") + >>> def func(s1: pd.Series, s2: pd.Series, s3: pd.DataFrame) -> pd.DataFrame: + ... s3['col2'] = s1 + s2.str.len() + ... return s3 + ... + >>> # Create a Spark DataFrame that has three columns including a sturct column. + ... df = spark.createDataFrame( + ... [[1, "a string", ("a nested string",)]], + ... "long_col long, string_col string, struct_col struct") + >>> df.printSchema() + root + |-- long_column: long (nullable = true) + |-- string_column: string (nullable = true) + |-- struct_column: struct (nullable = true) + | |-- col1: string (nullable = true) + >>> df.select(func("long_col", "string_col", "struct_col")).printSchema() + |-- func(long_col, string_col, struct_col): struct (nullable = true) + | |-- col1: string (nullable = true) + | |-- col2: long (nullable = true) + + In the following sections, it describes the cominations of the supported type hints. For + simplicity, `pandas.DataFrame` variant is omitted. + + * Series to Series + `pandas.Series`, ... 
-> `pandas.Series` + + The function takes one or more `pandas.Series` and outputs one `pandas.Series`. + The output of the function should always be of the same length as the input. + + >>> @pandas_udf("string") + ... def to_upper(s: pd.Series) -> pd.Series: + ... return s.str.upper() + ... + >>> df = spark.createDataFrame([("John Doe",)], ("name",)) + >>> df.select(to_upper("name")).show() + +--------------+ + |to_upper(name)| + +--------------+ + | JOHN DOE| + +--------------+ + + >>> @pandas_udf("first string, last string") + ... def split_expand(s: pd.Series) -> pd.DataFrame: + ... return s.str.split(expand=True) + ... + >>> df = spark.createDataFrame([("John Doe",)], ("name",)) + >>> df.select(split_expand("name")).show() + +------------------+ + |split_expand(name)| + +------------------+ + | [John, Doe]| + +------------------+ + + .. note:: The length of the input is not that of the whole input column, but is the + length of an internal batch used for each call to the function. + + * Iterator of Series to Iterator of Series + `Iterator[pandas.Series]` -> `Iterator[pandas.Series]` + + The function takes an iterator of `pandas.Series` and outputs an iterator of + `pandas.Series`. In this case, the created pandas UDF instance requires one input + column when this is called as a PySpark column. The output of each series from + the function should always be of the same length as the input. + + It is useful when the UDF execution + requires initializing some states although internally it works identically as + Series to Series case. The pseudocode below illustrates the example. + + .. highlight:: python + .. code-block:: python + + @pandas_udf("long") + def calculate(iterator: Iterator[pd.Series]) -> Iterator[pd.Series]: + # Do some expensive initialization with a state + state = very_expensive_initialization() + for x in iterator: + # Use that state for whole iterator. + yield calculate_with_state(x, state) + + df.select(calculate("value")).show() + + >>> from typing import Iterator + >>> @pandas_udf("long") + ... def plus_one(iterator: Iterator[pd.Series]) -> Iterator[pd.Series]: + ... for s in iterator: + ... yield s + 1 + ... + >>> df = spark.createDataFrame(pd.DataFrame([1, 2, 3], columns=["v"])) + >>> df.select(plus_one(df.v)).show() + +-----------+ + |plus_one(v)| + +-----------+ + | 2| + | 3| + | 4| + +-----------+ + + .. note:: The length of each series is the length of a batch internally used. + + * Iterator of Multiple Series to Iterator of Series + `Iterator[Tuple[pandas.Series, ...]]` -> `Iterator[pandas.Series]` + + The function takes an iterator of a tuple of multiple `pandas.Series` and outputs an + iterator of `pandas.Series`. In this case, the created pandas UDF instance requires + input columns as many as the series when this is called as a PySpark column. + It works identically as Iterator of Series to Iterator of Series case except + the parameter difference. The output of each series from the function should always + be of the same length as the input. + + >>> from typing import Iterator, Tuple + >>> from pyspark.sql.functions import struct, col + >>> @pandas_udf("long") + ... def multiply(iterator: Iterator[Tuple[pd.Series, pd.DataFrame]]) -> Iterator[pd.Series]: + ... for s1, df in iterator: + ... yield s1 * df.v + ... + >>> df = spark.createDataFrame(pd.DataFrame([1, 2, 3], columns=["v"])) + >>> df.withColumn('output', multiply(col("v"), struct(col("v")))).show() + +---+------+ + | v|output| + +---+------+ + | 1| 1| + | 2| 4| + | 3| 9| + +---+------+ + + .. 
note:: The length of each series is the length of a batch internally used. + + * Series to Scalar + `pandas.Series`, ... -> `Any` + + The function takes `pandas.Series` and returns a scalar value. The `returnType` + should be a primitive data type, and the returned scalar can be either a python primitive + type, e.g., int or float or a numpy data type, e.g., numpy.int64 or numpy.float64. + `Any` should ideally be a specific scalar type accordingly. + + >>> @pandas_udf("double") + ... def mean_udf(v: pd.Series) -> float: + ... return v.mean() + ... + >>> df = spark.createDataFrame( + ... [(1, 1.0), (1, 2.0), (2, 3.0), (2, 5.0), (2, 10.0)], ("id", "v")) + >>> df.groupby("id").agg(mean_udf(df['v'])).show() + +---+-----------+ + | id|mean_udf(v)| + +---+-----------+ + | 1| 1.5| + | 2| 6.0| + +---+-----------+ + + This UDF can also be used as window functions as below: + + >>> from pyspark.sql import Window + >>> @pandas_udf("double") + ... def mean_udf(v: pd.Series) -> float: + ... return v.mean() + ... + >>> df = spark.createDataFrame( + ... [(1, 1.0), (1, 2.0), (2, 3.0), (2, 5.0), (2, 10.0)], ("id", "v")) + >>> w = Window.partitionBy('id').orderBy('v').rowsBetween(-1, 0) + >>> df.withColumn('mean_v', mean_udf("v").over(w)).show() + +---+----+------+ + | id| v|mean_v| + +---+----+------+ + | 1| 1.0| 1.0| + | 1| 2.0| 1.5| + | 2| 3.0| 3.0| + | 2| 5.0| 4.0| + | 2|10.0| 7.5| + +---+----+------+ + + .. note:: For performance reasons, the input series to window functions are not copied. Therefore, mutating the input series is not allowed and will cause incorrect results. For the same reason, users should also not rely on the index of the input series. - .. seealso:: :meth:`pyspark.sql.GroupedData.agg` and :class:`pyspark.sql.Window` + .. seealso:: :meth:`pyspark.sql.GroupedData.agg` and :class:`pyspark.sql.Window` .. note:: The user-defined functions do not support conditional expressions or short circuiting in boolean expressions and it ends up with being executed all internally. If the functions @@ -348,10 +273,21 @@ def pandas_udf(f=None, returnType=None, functionType=None): .. note:: The user-defined functions do not take keyword arguments on the calling side. .. note:: The data type of returned `pandas.Series` from the user-defined functions should be - matched with defined returnType (see :meth:`types.to_arrow_type` and + matched with defined `returnType` (see :meth:`types.to_arrow_type` and :meth:`types.from_arrow_type`). When there is mismatch between them, Spark might do conversion on returned data. The conversion is not guaranteed to be correct and results should be checked for accuracy by users. + + .. note:: Currently, + :class:`pyspark.sql.types.MapType`, + :class:`pyspark.sql.types.ArrayType` of :class:`pyspark.sql.types.TimestampType` and + nested :class:`pyspark.sql.types.StructType` + are currently not supported as output types. + + .. seealso:: :meth:`pyspark.sql.DataFrame.mapInPandas` + .. seealso:: :meth:`pyspark.sql.GroupedData.applyInPandas` + .. seealso:: :meth:`pyspark.sql.PandasCogroupedOps.applyInPandas` + .. 
seealso:: :meth:`pyspark.sql.UDFRegistration.register` """ # The following table shows most of Pandas data and SQL type conversions in Pandas UDFs that @@ -480,25 +416,3 @@ def _create_pandas_udf(f, returnType, evalType): "or three arguments (key, left, right).") return _create_udf(f, returnType, evalType) - - -def _test(): - import doctest - from pyspark.sql import SparkSession - import pyspark.sql.pandas.functions - globs = pyspark.sql.pandas.functions.__dict__.copy() - spark = SparkSession.builder\ - .master("local[4]")\ - .appName("sql.pandas.functions tests")\ - .getOrCreate() - globs['spark'] = spark - (failure_count, test_count) = doctest.testmod( - pyspark.sql.pandas.functions, globs=globs, - optionflags=doctest.ELLIPSIS | doctest.NORMALIZE_WHITESPACE | doctest.REPORT_NDIFF) - spark.stop() - if failure_count: - sys.exit(-1) - - -if __name__ == "__main__": - _test() diff --git a/python/pyspark/sql/pandas/group_ops.py b/python/pyspark/sql/pandas/group_ops.py index 3152271ba9df8..b93f0516cadb1 100644 --- a/python/pyspark/sql/pandas/group_ops.py +++ b/python/pyspark/sql/pandas/group_ops.py @@ -88,29 +88,27 @@ def applyInPandas(self, func, schema): to the user-function and the returned `pandas.DataFrame` are combined as a :class:`DataFrame`. - The returned `pandas.DataFrame` can be of arbitrary length and its schema must match the - returnType of the pandas udf. - - .. note:: This function requires a full shuffle. All the data of a group will be loaded - into memory, so the user should be aware of the potential OOM risk if data is skewed - and certain groups are too large to fit in memory. + The `schema` should be a :class:`StructType` describing the schema of the returned + `pandas.DataFrame`. The column labels of the returned `pandas.DataFrame` must either match + the field names in the defined schema if specified as strings, or match the + field data types by position if not strings, e.g. integer indices. + The length of the returned `pandas.DataFrame` can be arbitrary. :param func: a Python native function that takes a `pandas.DataFrame`, and outputs a `pandas.DataFrame`. :param schema: the return type of the `func` in PySpark. The value can be either a :class:`pyspark.sql.types.DataType` object or a DDL-formatted type string. - .. note:: Experimental - - >>> from pyspark.sql.functions import pandas_udf, PandasUDFType + >>> import pandas as pd # doctest: +SKIP + >>> from pyspark.sql.functions import pandas_udf, ceil >>> df = spark.createDataFrame( ... [(1, 1.0), (1, 2.0), (2, 3.0), (2, 5.0), (2, 10.0)], - ... ("id", "v")) + ... ("id", "v")) # doctest: +SKIP >>> def normalize(pdf): ... v = pdf.v ... return pdf.assign(v=(v - v.mean()) / v.std()) - >>> df.groupby("id").applyInPandas(normalize, schema="id long, v double").show() - ... # doctest: +SKIP + >>> df.groupby("id").applyInPandas( + ... normalize, schema="id long, v double").show() # doctest: +SKIP +---+-------------------+ | id| v| +---+-------------------+ @@ -121,8 +119,56 @@ def applyInPandas(self, func, schema): | 2| 1.1094003924504583| +---+-------------------+ - .. seealso:: :meth:`pyspark.sql.functions.pandas_udf` + Alternatively, the user can pass a function that takes two arguments. + In this case, the grouping key(s) will be passed as the first argument and the data will + be passed as the second argument. The grouping key(s) will be passed as a tuple of numpy + data types, e.g., `numpy.int32` and `numpy.float64`. 
The data will still be passed in + as a `pandas.DataFrame` containing all columns from the original Spark DataFrame. + This is useful when the user does not want to hardcode grouping key(s) in the function. + + >>> df = spark.createDataFrame( + ... [(1, 1.0), (1, 2.0), (2, 3.0), (2, 5.0), (2, 10.0)], + ... ("id", "v")) # doctest: +SKIP + >>> def mean_func(key, pdf): + ... # key is a tuple of one numpy.int64, which is the value + ... # of 'id' for the current group + ... return pd.DataFrame([key + (pdf.v.mean(),)]) + >>> df.groupby('id').applyInPandas( + ... mean_func, schema="id long, v double").show() # doctest: +SKIP + +---+---+ + | id| v| + +---+---+ + | 1|1.5| + | 2|6.0| + +---+---+ + >>> def sum_func(key, pdf): + ... # key is a tuple of two numpy.int64s, which is the values + ... # of 'id' and 'ceil(df.v / 2)' for the current group + ... return pd.DataFrame([key + (pdf.v.sum(),)]) + >>> df.groupby(df.id, ceil(df.v / 2)).applyInPandas( + ... sum_func, schema="id long, `ceil(v / 2)` long, v double").show() # doctest: +SKIP + +---+-----------+----+ + | id|ceil(v / 2)| v| + +---+-----------+----+ + | 2| 5|10.0| + | 1| 1| 3.0| + | 2| 3| 5.0| + | 2| 2| 3.0| + +---+-----------+----+ + + .. note:: This function requires a full shuffle. All the data of a group will be loaded + into memory, so the user should be aware of the potential OOM risk if data is skewed + and certain groups are too large to fit in memory. + + .. note:: If returning a new `pandas.DataFrame` constructed with a dictionary, it is + recommended to explicitly index the columns by name to ensure the positions are correct, + or alternatively use an `OrderedDict`. + For example, `pd.DataFrame({'id': ids, 'a': data}, columns=['id', 'a'])` or + `pd.DataFrame(OrderedDict([('id', ids), ('a', data)]))`. + .. note:: Experimental + + .. seealso:: :meth:`pyspark.sql.functions.pandas_udf` """ from pyspark.sql import GroupedData from pyspark.sql.functions import pandas_udf, PandasUDFType @@ -176,14 +222,11 @@ def applyInPandas(self, func, schema): `pandas.DataFrame` to the user-function and the returned `pandas.DataFrame` are combined as a :class:`DataFrame`. - The returned `pandas.DataFrame` can be of arbitrary length and its schema must match the - returnType of the pandas udf. - - .. note:: This function requires a full shuffle. All the data of a cogroup will be loaded - into memory, so the user should be aware of the potential OOM risk if data is skewed - and certain groups are too large to fit in memory. - - .. note:: Experimental + The `schema` should be a :class:`StructType` describing the schema of the returned + `pandas.DataFrame`. The column labels of the returned `pandas.DataFrame` must either match + the field names in the defined schema if specified as strings, or match the + field data types by position if not strings, e.g. integer indices. + The length of the returned `pandas.DataFrame` can be arbitrary. :param func: a Python native function that takes two `pandas.DataFrame`\\s, and outputs a `pandas.DataFrame`, or that takes one tuple (grouping keys) and two @@ -191,7 +234,7 @@ def applyInPandas(self, func, schema): :param schema: the return type of the `func` in PySpark. The value can be either a :class:`pyspark.sql.types.DataType` object or a DDL-formatted type string. - >>> from pyspark.sql.functions import pandas_udf, PandasUDFType + >>> from pyspark.sql.functions import pandas_udf >>> df1 = spark.createDataFrame( ... [(20000101, 1, 1.0), (20000101, 2, 2.0), (20000102, 1, 3.0), (20000102, 2, 4.0)], ... 
("time", "id", "v1")) @@ -232,6 +275,18 @@ def applyInPandas(self, func, schema): |20000102| 1|3.0| x| +--------+---+---+---+ + .. note:: This function requires a full shuffle. All the data of a cogroup will be loaded + into memory, so the user should be aware of the potential OOM risk if data is skewed + and certain groups are too large to fit in memory. + + .. note:: If returning a new `pandas.DataFrame` constructed with a dictionary, it is + recommended to explicitly index the columns by name to ensure the positions are correct, + or alternatively use an `OrderedDict`. + For example, `pd.DataFrame({'id': ids, 'a': data}, columns=['id', 'a'])` or + `pd.DataFrame(OrderedDict([('id', ids), ('a', data)]))`. + + .. note:: Experimental + .. seealso:: :meth:`pyspark.sql.functions.pandas_udf` """ diff --git a/python/pyspark/sql/pandas/map_ops.py b/python/pyspark/sql/pandas/map_ops.py index 75cacd797f9dd..9835e88c6ac21 100644 --- a/python/pyspark/sql/pandas/map_ops.py +++ b/python/pyspark/sql/pandas/map_ops.py @@ -45,10 +45,10 @@ def mapInPandas(self, func, schema): :param schema: the return type of the `func` in PySpark. The value can be either a :class:`pyspark.sql.types.DataType` object or a DDL-formatted type string. - >>> from pyspark.sql.functions import pandas_udf, PandasUDFType + >>> from pyspark.sql.functions import pandas_udf >>> df = spark.createDataFrame([(1, 21), (2, 30)], ("id", "age")) - >>> def filter_func(batch_iter): - ... for pdf in batch_iter: + >>> def filter_func(iterator): + ... for pdf in iterator: ... yield pdf[pdf.id == 1] >>> df.mapInPandas(filter_func, df.schema).show() # doctest: +SKIP +---+---+ diff --git a/python/pyspark/sql/udf.py b/python/pyspark/sql/udf.py index 433c5fc845c59..10546ecacc57f 100644 --- a/python/pyspark/sql/udf.py +++ b/python/pyspark/sql/udf.py @@ -297,17 +297,18 @@ def register(self, name, f, returnType=None): >>> spark.sql("SELECT random_udf()").collect() # doctest: +SKIP [Row(random_udf()=82)] - >>> from pyspark.sql.functions import pandas_udf, PandasUDFType - >>> @pandas_udf("integer", PandasUDFType.SCALAR) # doctest: +SKIP - ... def add_one(x): - ... return x + 1 + >>> import pandas as pd # doctest: +SKIP + >>> from pyspark.sql.functions import pandas_udf + >>> @pandas_udf("integer") # doctest: +SKIP + ... def add_one(s: pd.Series) -> pd.Series: + ... return s + 1 ... >>> _ = spark.udf.register("add_one", add_one) # doctest: +SKIP >>> spark.sql("SELECT add_one(id) FROM range(3)").collect() # doctest: +SKIP [Row(add_one(id)=1), Row(add_one(id)=2), Row(add_one(id)=3)] - >>> @pandas_udf("integer", PandasUDFType.GROUPED_AGG) # doctest: +SKIP - ... def sum_udf(v): + >>> @pandas_udf("integer") # doctest: +SKIP + ... def sum_udf(v: pd.Series) -> int: ... return v.sum() ... >>> _ = spark.udf.register("sum_udf", sum_udf) # doctest: +SKIP @@ -414,6 +415,9 @@ def _test(): .appName("sql.udf tests")\ .getOrCreate() globs['spark'] = spark + # Hack to skip the unit tests in register. These are currently being tested in proper tests. + # We should reenable this test once we completely drop Python 2. 
+ del pyspark.sql.udf.UDFRegistration.register (failure_count, test_count) = doctest.testmod( pyspark.sql.udf, globs=globs, optionflags=doctest.ELLIPSIS | doctest.NORMALIZE_WHITESPACE) From b4769998efee0f5998104b689b710c11ee0dbd14 Mon Sep 17 00:00:00 2001 From: Kris Mok Date: Wed, 12 Feb 2020 15:19:16 +0900 Subject: [PATCH 021/185] [SPARK-30795][SQL] Spark SQL codegen's code() interpolator should treat escapes like Scala's StringContext.s() ### What changes were proposed in this pull request? This PR proposes to make the `code` string interpolator treat escapes the same way as Scala's builtin `StringContext.s()` string interpolator. This will remove the need for an ugly workaround in `Like` expression's codegen. ### Why are the changes needed? The `code()` string interpolator in Spark SQL's code generator should treat escapes like Scala's builtin `StringContext.s()` interpolator, i.e. it should treat escapes in the code parts, and should not treat escapes in the input arguments. For example, ```scala val arg = "This is an argument." val str = s"This is string part 1. $arg This is string part 2." val code = code"This is string part 1. $arg This is string part 2." assert(code.toString == str) ``` We should expect the `code()` interpolator to produce the same result as the `StringContext.s()` interpolator, where only escapes in the string parts should be treated, while the args should be kept verbatim. But in the current implementation, due to the eager folding of code parts and literal input args, the escape treatment is incorrectly done on both code parts and literal args. That causes a problem when an arg contains escape sequences and wants to preserve that in the final produced code string. For example, in `Like` expression's codegen, there's an ugly workaround for this bug: ```scala // We need double escape to avoid org.codehaus.commons.compiler.CompileException. // '\\' will cause exception 'Single quote must be backslash-escaped in character literal'. // '\"' will cause exception 'Line break in literal not allowed'. val newEscapeChar = if (escapeChar == '\"' || escapeChar == '\\') { s"""\\\\\\$escapeChar""" } else { escapeChar } ``` ### Does this PR introduce any user-facing change? No. ### How was this patch tested? Added a new unit test case in `CodeBlockSuite`. Closes #27544 from rednaxelafx/fix-code-string-interpolator. Authored-by: Kris Mok Signed-off-by: HyukjinKwon --- .../sql/catalyst/expressions/codegen/javaCode.scala | 13 +++++++++---- .../catalyst/expressions/regexpExpressions.scala | 13 ++++--------- .../expressions/codegen/CodeBlockSuite.scala | 12 ++++++++++++ 3 files changed, 25 insertions(+), 13 deletions(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/javaCode.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/javaCode.scala index d9393b9df6bbd..dff258902a0b1 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/javaCode.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/javaCode.scala @@ -223,6 +223,11 @@ object Block { implicit def blocksToBlock(blocks: Seq[Block]): Block = blocks.reduceLeft(_ + _) implicit class BlockHelper(val sc: StringContext) extends AnyVal { + /** + * A string interpolator that retains references to the `JavaCode` inputs, and behaves like + * the Scala builtin StringContext.s() interpolator otherwise, i.e. 
it will treat escapes in + * the code parts, and will not treat escapes in the input arguments. + */ def code(args: Any*): Block = { sc.checkLengths(args) if (sc.parts.length == 0) { @@ -250,7 +255,7 @@ object Block { val inputs = args.iterator val buf = new StringBuilder(Block.CODE_BLOCK_BUFFER_LENGTH) - buf.append(strings.next) + buf.append(StringContext.treatEscapes(strings.next)) while (strings.hasNext) { val input = inputs.next input match { @@ -262,7 +267,7 @@ object Block { case _ => buf.append(input) } - buf.append(strings.next) + buf.append(StringContext.treatEscapes(strings.next)) } codeParts += buf.toString @@ -286,10 +291,10 @@ case class CodeBlock(codeParts: Seq[String], blockInputs: Seq[JavaCode]) extends val strings = codeParts.iterator val inputs = blockInputs.iterator val buf = new StringBuilder(Block.CODE_BLOCK_BUFFER_LENGTH) - buf.append(StringContext.treatEscapes(strings.next)) + buf.append(strings.next) while (strings.hasNext) { buf.append(inputs.next) - buf.append(StringContext.treatEscapes(strings.next)) + buf.append(strings.next) } buf.toString } diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/regexpExpressions.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/regexpExpressions.scala index e5ee0edfcf79b..6a4d813d345b3 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/regexpExpressions.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/regexpExpressions.scala @@ -159,19 +159,14 @@ case class Like(left: Expression, right: Expression, escapeChar: Char) } else { val pattern = ctx.freshName("pattern") val rightStr = ctx.freshName("rightStr") - // We need double escape to avoid org.codehaus.commons.compiler.CompileException. - // '\\' will cause exception 'Single quote must be backslash-escaped in character literal'. - // '\"' will cause exception 'Line break in literal not allowed'. - val newEscapeChar = if (escapeChar == '\"' || escapeChar == '\\') { - s"""\\\\\\$escapeChar""" - } else { - escapeChar - } + // We need to escape the escapeChar to make sure the generated code is valid. + // Otherwise we'll hit org.codehaus.commons.compiler.CompileException. 
+ val escapedEscapeChar = StringEscapeUtils.escapeJava(escapeChar.toString) nullSafeCodeGen(ctx, ev, (eval1, eval2) => { s""" String $rightStr = $eval2.toString(); $patternClass $pattern = $patternClass.compile( - $escapeFunc($rightStr, '$newEscapeChar')); + $escapeFunc($rightStr, '$escapedEscapeChar')); ${ev.value} = $pattern.matcher($eval1.toString()).matches(); """ }) diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/codegen/CodeBlockSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/codegen/CodeBlockSuite.scala index 55569b6f2933e..67e3bc69543e8 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/codegen/CodeBlockSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/codegen/CodeBlockSuite.scala @@ -37,6 +37,18 @@ class CodeBlockSuite extends SparkFunSuite { assert(code.asInstanceOf[CodeBlock].blockInputs === Seq(value)) } + test("Code parts should be treated for escapes, but string inputs shouldn't be") { + val strlit = raw"\\" + val code = code"""String s = "foo\\bar" + "$strlit";""" + + val builtin = s"""String s = "foo\\bar" + "$strlit";""" + + val expected = raw"""String s = "foo\bar" + "\\";""" + + assert(builtin == expected) + assert(code.asInstanceOf[CodeBlock].toString == expected) + } + test("Block.stripMargin") { val isNull = JavaCode.isNullVariable("expr1_isNull") val value = JavaCode.variable("expr1", IntegerType) From f5026b1ba7c05548d5f271d6d3edf7dfd4c3f9ce Mon Sep 17 00:00:00 2001 From: beliefer Date: Wed, 12 Feb 2020 14:49:22 +0800 Subject: [PATCH 022/185] [SPARK-30763][SQL] Fix java.lang.IndexOutOfBoundsException No group 1 for regexp_extract ### What changes were proposed in this pull request? The current implement of `regexp_extract` will throws a unprocessed exception show below: `SELECT regexp_extract('1a 2b 14m', 'd+')` ``` java.lang.IndexOutOfBoundsException: No group 1 [info] at java.util.regex.Matcher.group(Matcher.java:538) [info] at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIteratorForCodegenStage1.processNext(Unknown Source) [info] at org.apache.spark.sql.execution.BufferedRowIterator.hasNext(BufferedRowIterator.java:43) [info] at org.apache.spark.sql.execution.WholeStageCodegenExec$$anon$1.hasNext(WholeStageCodegenExec.scala:729) ``` I think should treat this exception well. ### Why are the changes needed? Fix a bug `java.lang.IndexOutOfBoundsException No group 1 ` ### Does this PR introduce any user-facing change? Yes ### How was this patch tested? New UT Closes #27508 from beliefer/fix-regexp_extract-bug. 
Authored-by: beliefer Signed-off-by: Wenchen Fan --- .../expressions/regexpExpressions.scala | 15 +++- .../expressions/RegexpExpressionsSuite.scala | 12 ++++ .../sql-tests/inputs/regexp-functions.sql | 9 +++ .../results/regexp-functions.sql.out | 69 +++++++++++++++++++ 4 files changed, 104 insertions(+), 1 deletion(-) create mode 100644 sql/core/src/test/resources/sql-tests/inputs/regexp-functions.sql create mode 100644 sql/core/src/test/resources/sql-tests/results/regexp-functions.sql.out diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/regexpExpressions.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/regexpExpressions.scala index 6a4d813d345b3..3f60ca388a807 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/regexpExpressions.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/regexpExpressions.scala @@ -410,6 +410,15 @@ case class RegExpReplace(subject: Expression, regexp: Expression, rep: Expressio } } +object RegExpExtract { + def checkGroupIndex(groupCount: Int, groupIndex: Int): Unit = { + if (groupCount < groupIndex) { + throw new IllegalArgumentException( + s"Regex group count is $groupCount, but the specified group index is $groupIndex") + } + } +} + /** * Extract a specific(idx) group identified by a Java regex. * @@ -441,7 +450,9 @@ case class RegExpExtract(subject: Expression, regexp: Expression, idx: Expressio val m = pattern.matcher(s.toString) if (m.find) { val mr: MatchResult = m.toMatchResult - val group = mr.group(r.asInstanceOf[Int]) + val index = r.asInstanceOf[Int] + RegExpExtract.checkGroupIndex(mr.groupCount, index) + val group = mr.group(index) if (group == null) { // Pattern matched, but not optional group UTF8String.EMPTY_UTF8 } else { @@ -459,6 +470,7 @@ case class RegExpExtract(subject: Expression, regexp: Expression, idx: Expressio override protected def doGenCode(ctx: CodegenContext, ev: ExprCode): ExprCode = { val classNamePattern = classOf[Pattern].getCanonicalName + val classNameRegExpExtract = classOf[RegExpExtract].getCanonicalName val matcher = ctx.freshName("matcher") val matchResult = ctx.freshName("matchResult") @@ -482,6 +494,7 @@ case class RegExpExtract(subject: Expression, regexp: Expression, idx: Expressio $termPattern.matcher($subject.toString()); if ($matcher.find()) { java.util.regex.MatchResult $matchResult = $matcher.toMatchResult(); + $classNameRegExpExtract.checkGroupIndex($matchResult.groupCount(), $idx); if ($matchResult.group($idx) == null) { ${ev.value} = UTF8String.EMPTY_UTF8; } else { diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/RegexpExpressionsSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/RegexpExpressionsSuite.scala index 2c8794f083dbb..86da62bc74940 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/RegexpExpressionsSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/RegexpExpressionsSuite.scala @@ -293,6 +293,18 @@ class RegexpExpressionsSuite extends SparkFunSuite with ExpressionEvalHelper { val nonNullExpr = RegExpExtract(Literal("100-200"), Literal("(\\d+)-(\\d+)"), Literal(1)) checkEvaluation(nonNullExpr, "100", row1) + + // invalid group index + val row8 = create_row("100-200", "(\\d+)-(\\d+)", 3) + val row9 = create_row("100-200", "(\\d+).*", 2) + val row10 = create_row("100-200", "\\d+", 1) + + checkExceptionInExpression[IllegalArgumentException]( + expr, 
row8, "Regex group count is 2, but the specified group index is 3") + checkExceptionInExpression[IllegalArgumentException]( + expr, row9, "Regex group count is 1, but the specified group index is 2") + checkExceptionInExpression[IllegalArgumentException]( + expr, row10, "Regex group count is 0, but the specified group index is 1") } test("SPLIT") { diff --git a/sql/core/src/test/resources/sql-tests/inputs/regexp-functions.sql b/sql/core/src/test/resources/sql-tests/inputs/regexp-functions.sql new file mode 100644 index 0000000000000..c0827a3cba39b --- /dev/null +++ b/sql/core/src/test/resources/sql-tests/inputs/regexp-functions.sql @@ -0,0 +1,9 @@ +-- regexp_extract +SELECT regexp_extract('1a 2b 14m', '\\d+'); +SELECT regexp_extract('1a 2b 14m', '\\d+', 0); +SELECT regexp_extract('1a 2b 14m', '\\d+', 1); +SELECT regexp_extract('1a 2b 14m', '\\d+', 2); +SELECT regexp_extract('1a 2b 14m', '(\\d+)([a-z]+)'); +SELECT regexp_extract('1a 2b 14m', '(\\d+)([a-z]+)', 0); +SELECT regexp_extract('1a 2b 14m', '(\\d+)([a-z]+)', 1); +SELECT regexp_extract('1a 2b 14m', '(\\d+)([a-z]+)', 2); diff --git a/sql/core/src/test/resources/sql-tests/results/regexp-functions.sql.out b/sql/core/src/test/resources/sql-tests/results/regexp-functions.sql.out new file mode 100644 index 0000000000000..c92c1ddca774f --- /dev/null +++ b/sql/core/src/test/resources/sql-tests/results/regexp-functions.sql.out @@ -0,0 +1,69 @@ +-- Automatically generated by SQLQueryTestSuite +-- Number of queries: 8 + + +-- !query +SELECT regexp_extract('1a 2b 14m', '\\d+') +-- !query schema +struct<> +-- !query output +java.lang.IllegalArgumentException +Regex group count is 0, but the specified group index is 1 + + +-- !query +SELECT regexp_extract('1a 2b 14m', '\\d+', 0) +-- !query schema +struct +-- !query output +1 + + +-- !query +SELECT regexp_extract('1a 2b 14m', '\\d+', 1) +-- !query schema +struct<> +-- !query output +java.lang.IllegalArgumentException +Regex group count is 0, but the specified group index is 1 + + +-- !query +SELECT regexp_extract('1a 2b 14m', '\\d+', 2) +-- !query schema +struct<> +-- !query output +java.lang.IllegalArgumentException +Regex group count is 0, but the specified group index is 2 + + +-- !query +SELECT regexp_extract('1a 2b 14m', '(\\d+)([a-z]+)') +-- !query schema +struct +-- !query output +1 + + +-- !query +SELECT regexp_extract('1a 2b 14m', '(\\d+)([a-z]+)', 0) +-- !query schema +struct +-- !query output +1a + + +-- !query +SELECT regexp_extract('1a 2b 14m', '(\\d+)([a-z]+)', 1) +-- !query schema +struct +-- !query output +1 + + +-- !query +SELECT regexp_extract('1a 2b 14m', '(\\d+)([a-z]+)', 2) +-- !query schema +struct +-- !query output +a From 8b1839728acaa5e61f542a7332505289726d3162 Mon Sep 17 00:00:00 2001 From: turbofei Date: Wed, 12 Feb 2020 20:21:52 +0900 Subject: [PATCH 023/185] [SPARK-29542][FOLLOW-UP] Keep the description of spark.sql.files.* in tuning guide be consistent with that in SQLConf ### What changes were proposed in this pull request? This pr is a follow up of https://github.com/apache/spark/pull/26200. In this PR, I modify the description of spark.sql.files.* in sql-performance-tuning.md to keep consistent with that in SQLConf. ### Why are the changes needed? To keep consistent with the description in SQLConf. ### Does this PR introduce any user-facing change? No. ### How was this patch tested? Existed UT. Closes #27545 from turboFei/SPARK-29542-follow-up. 
Authored-by: turbofei Signed-off-by: HyukjinKwon --- docs/sql-performance-tuning.md | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/docs/sql-performance-tuning.md b/docs/sql-performance-tuning.md index e289854c7acc7..5a86c0cc31e12 100644 --- a/docs/sql-performance-tuning.md +++ b/docs/sql-performance-tuning.md @@ -67,6 +67,7 @@ that these options will be deprecated in future release as more optimizations ar 134217728 (128 MB) The maximum number of bytes to pack into a single partition when reading files. + This configuration is effective only when using file-based sources such as Parquet, JSON and ORC. @@ -76,7 +77,8 @@ that these options will be deprecated in future release as more optimizations ar The estimated cost to open a file, measured by the number of bytes could be scanned in the same time. This is used when putting multiple files into a partition. It is better to over-estimated, then the partitions with small files will be faster than partitions with bigger files (which is - scheduled first). + scheduled first). This configuration is effective only when using file-based sources such as Parquet, + JSON and ORC. From c1986204e59f1e8cc4b611d5a578cb248cb74c28 Mon Sep 17 00:00:00 2001 From: Maxim Gekk Date: Wed, 12 Feb 2020 20:12:38 +0800 Subject: [PATCH 024/185] [SPARK-30788][SQL] Support `SimpleDateFormat` and `FastDateFormat` as legacy date/timestamp formatters ### What changes were proposed in this pull request? In the PR, I propose to add legacy date/timestamp formatters based on `SimpleDateFormat` and `FastDateFormat`: - `LegacyFastTimestampFormatter` - uses `FastDateFormat` and supports parsing/formatting in microsecond precision. The code was borrowed from Spark 2.4, see https://github.com/apache/spark/pull/26507 & https://github.com/apache/spark/pull/26582 - `LegacySimpleTimestampFormatter` uses `SimpleDateFormat`, and support the `lenient` mode. When the `lenient` parameter is set to `false`, the parser become much stronger in checking its input. ### Why are the changes needed? Spark 2.4.x uses the following parsers for parsing/formatting date/timestamp strings: - `DateTimeFormat` in CSV/JSON datasource - `SimpleDateFormat` - is used in JDBC datasource, in partitions parsing. - `SimpleDateFormat` in strong mode (`lenient = false`), see https://github.com/apache/spark/blob/branch-2.4/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/DateTimeUtils.scala#L124. It is used by the `date_format`, `from_unixtime`, `unix_timestamp` and `to_unix_timestamp` functions. The PR aims to make Spark 3.0 compatible with Spark 2.4.x in all those cases when `spark.sql.legacy.timeParser.enabled` is set to `true`. ### Does this PR introduce any user-facing change? This shouldn't change behavior with default settings. If `spark.sql.legacy.timeParser.enabled` is set to `true`, users should observe behavior of Spark 2.4. ### How was this patch tested? - Modified tests in `DateExpressionsSuite` to check the legacy parser - `SimpleDateFormat`. - Added `CSVLegacyTimeParserSuite` and `JsonLegacyTimeParserSuite` to run `CSVSuite` and `JsonSuite` with the legacy parser - `FastDateFormat`. Closes #27524 from MaxGekk/timestamp-formatter-legacy-fallback. 
Authored-by: Maxim Gekk Signed-off-by: Wenchen Fan --- .../sql/catalyst/csv/CSVInferSchema.scala | 4 +- .../spark/sql/catalyst/csv/CSVOptions.scala | 4 +- .../sql/catalyst/csv/UnivocityGenerator.scala | 7 +- .../sql/catalyst/csv/UnivocityParser.scala | 7 +- .../expressions/datetimeExpressions.scala | 52 ++- .../spark/sql/catalyst/json/JSONOptions.scala | 4 +- .../sql/catalyst/json/JacksonGenerator.scala | 7 +- .../sql/catalyst/json/JacksonParser.scala | 7 +- .../sql/catalyst/json/JsonInferSchema.scala | 4 +- .../sql/catalyst/util/DateFormatter.scala | 66 ++- .../catalyst/util/TimestampFormatter.scala | 132 +++++- .../org/apache/spark/sql/types/Decimal.scala | 2 +- .../expressions/DateExpressionsSuite.scala | 390 ++++++++++-------- .../org/apache/spark/sql/functions.scala | 7 +- .../resources/test-data/bad_after_good.csv | 2 +- .../resources/test-data/value-malformed.csv | 2 +- .../apache/spark/sql/DateFunctionsSuite.scala | 346 +++++++++------- .../execution/datasources/csv/CSVSuite.scala | 23 +- .../datasources/json/JsonSuite.scala | 7 + 19 files changed, 654 insertions(+), 419 deletions(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/csv/CSVInferSchema.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/csv/CSVInferSchema.scala index 03cc3cbdf790a..c6a03183ab45e 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/csv/CSVInferSchema.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/csv/CSVInferSchema.scala @@ -24,6 +24,7 @@ import scala.util.control.Exception.allCatch import org.apache.spark.rdd.RDD import org.apache.spark.sql.catalyst.analysis.TypeCoercion import org.apache.spark.sql.catalyst.expressions.ExprUtils +import org.apache.spark.sql.catalyst.util.LegacyDateFormats.FAST_DATE_FORMAT import org.apache.spark.sql.catalyst.util.TimestampFormatter import org.apache.spark.sql.types._ @@ -32,7 +33,8 @@ class CSVInferSchema(val options: CSVOptions) extends Serializable { private val timestampParser = TimestampFormatter( options.timestampFormat, options.zoneId, - options.locale) + options.locale, + legacyFormat = FAST_DATE_FORMAT) private val decimalParser = if (options.locale == Locale.US) { // Special handling the default locale for backward compatibility diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/csv/CSVOptions.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/csv/CSVOptions.scala index 5e40d74e54f11..8892037e03a7d 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/csv/CSVOptions.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/csv/CSVOptions.scala @@ -146,10 +146,10 @@ class CSVOptions( // A language tag in IETF BCP 47 format val locale: Locale = parameters.get("locale").map(Locale.forLanguageTag).getOrElse(Locale.US) - val dateFormat: String = parameters.getOrElse("dateFormat", "uuuu-MM-dd") + val dateFormat: String = parameters.getOrElse("dateFormat", DateFormatter.defaultPattern) val timestampFormat: String = - parameters.getOrElse("timestampFormat", "uuuu-MM-dd'T'HH:mm:ss.SSSXXX") + parameters.getOrElse("timestampFormat", s"${DateFormatter.defaultPattern}'T'HH:mm:ss.SSSXXX") val multiLine = parameters.get("multiLine").map(_.toBoolean).getOrElse(false) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/csv/UnivocityGenerator.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/csv/UnivocityGenerator.scala index 05cb91d10868e..00e3d49787db1 100644 --- 
a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/csv/UnivocityGenerator.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/csv/UnivocityGenerator.scala @@ -23,6 +23,7 @@ import com.univocity.parsers.csv.CsvWriter import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.util.{DateFormatter, TimestampFormatter} +import org.apache.spark.sql.catalyst.util.LegacyDateFormats.FAST_DATE_FORMAT import org.apache.spark.sql.types._ class UnivocityGenerator( @@ -44,11 +45,13 @@ class UnivocityGenerator( private val timestampFormatter = TimestampFormatter( options.timestampFormat, options.zoneId, - options.locale) + options.locale, + legacyFormat = FAST_DATE_FORMAT) private val dateFormatter = DateFormatter( options.dateFormat, options.zoneId, - options.locale) + options.locale, + legacyFormat = FAST_DATE_FORMAT) private def makeConverter(dataType: DataType): ValueConverter = dataType match { case DateType => diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/csv/UnivocityParser.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/csv/UnivocityParser.scala index 5510953804025..cd69c21a01976 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/csv/UnivocityParser.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/csv/UnivocityParser.scala @@ -27,6 +27,7 @@ import org.apache.spark.internal.Logging import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions.{ExprUtils, GenericInternalRow} import org.apache.spark.sql.catalyst.util._ +import org.apache.spark.sql.catalyst.util.LegacyDateFormats.FAST_DATE_FORMAT import org.apache.spark.sql.sources.Filter import org.apache.spark.sql.types._ import org.apache.spark.unsafe.types.UTF8String @@ -86,11 +87,13 @@ class UnivocityParser( private val timestampFormatter = TimestampFormatter( options.timestampFormat, options.zoneId, - options.locale) + options.locale, + legacyFormat = FAST_DATE_FORMAT) private val dateFormatter = DateFormatter( options.dateFormat, options.zoneId, - options.locale) + options.locale, + legacyFormat = FAST_DATE_FORMAT) private val csvFilters = new CSVFilters(filters, requiredSchema) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/datetimeExpressions.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/datetimeExpressions.scala index aa2bd5a1273e0..1f4c8c041f8bf 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/datetimeExpressions.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/datetimeExpressions.scala @@ -30,9 +30,10 @@ import org.apache.spark.sql.AnalysisException import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions.codegen._ import org.apache.spark.sql.catalyst.expressions.codegen.Block._ -import org.apache.spark.sql.catalyst.util.{DateTimeUtils, TimestampFormatter} +import org.apache.spark.sql.catalyst.util.{DateTimeUtils, LegacyDateFormats, TimestampFormatter} import org.apache.spark.sql.catalyst.util.DateTimeConstants._ import org.apache.spark.sql.catalyst.util.DateTimeUtils._ +import org.apache.spark.sql.catalyst.util.LegacyDateFormats.SIMPLE_DATE_FORMAT import org.apache.spark.sql.types._ import org.apache.spark.unsafe.types.{CalendarInterval, UTF8String} @@ -622,13 +623,15 @@ case class DateFormatClass(left: Expression, right: Expression, timeZoneId: Opti @transient private lazy val formatter: 
Option[TimestampFormatter] = { if (right.foldable) { - Option(right.eval()).map(format => TimestampFormatter(format.toString, zoneId)) + Option(right.eval()).map { format => + TimestampFormatter(format.toString, zoneId, legacyFormat = SIMPLE_DATE_FORMAT) + } } else None } override protected def nullSafeEval(timestamp: Any, format: Any): Any = { val tf = if (formatter.isEmpty) { - TimestampFormatter(format.toString, zoneId) + TimestampFormatter(format.toString, zoneId, legacyFormat = SIMPLE_DATE_FORMAT) } else { formatter.get } @@ -643,10 +646,14 @@ case class DateFormatClass(left: Expression, right: Expression, timeZoneId: Opti }) }.getOrElse { val tf = TimestampFormatter.getClass.getName.stripSuffix("$") + val ldf = LegacyDateFormats.getClass.getName.stripSuffix("$") val zid = ctx.addReferenceObj("zoneId", zoneId, classOf[ZoneId].getName) defineCodeGen(ctx, ev, (timestamp, format) => { - s"""UTF8String.fromString($tf$$.MODULE$$.apply($format.toString(), $zid) - .format($timestamp))""" + s"""|UTF8String.fromString($tf$$.MODULE$$.apply( + | $format.toString(), + | $zid, + | $ldf$$.MODULE$$.SIMPLE_DATE_FORMAT()) + |.format($timestamp))""".stripMargin }) } } @@ -688,7 +695,7 @@ case class ToUnixTimestamp( copy(timeZoneId = Option(timeZoneId)) def this(time: Expression) = { - this(time, Literal("uuuu-MM-dd HH:mm:ss")) + this(time, Literal(TimestampFormatter.defaultPattern)) } override def prettyName: String = "to_unix_timestamp" @@ -732,7 +739,7 @@ case class UnixTimestamp(timeExp: Expression, format: Expression, timeZoneId: Op copy(timeZoneId = Option(timeZoneId)) def this(time: Expression) = { - this(time, Literal("uuuu-MM-dd HH:mm:ss")) + this(time, Literal(TimestampFormatter.defaultPattern)) } def this() = { @@ -758,7 +765,7 @@ abstract class ToTimestamp private lazy val constFormat: UTF8String = right.eval().asInstanceOf[UTF8String] private lazy val formatter: TimestampFormatter = try { - TimestampFormatter(constFormat.toString, zoneId) + TimestampFormatter(constFormat.toString, zoneId, legacyFormat = SIMPLE_DATE_FORMAT) } catch { case NonFatal(_) => null } @@ -791,8 +798,8 @@ abstract class ToTimestamp } else { val formatString = f.asInstanceOf[UTF8String].toString try { - TimestampFormatter(formatString, zoneId).parse( - t.asInstanceOf[UTF8String].toString) / downScaleFactor + TimestampFormatter(formatString, zoneId, legacyFormat = SIMPLE_DATE_FORMAT) + .parse(t.asInstanceOf[UTF8String].toString) / downScaleFactor } catch { case NonFatal(_) => null } @@ -831,13 +838,16 @@ abstract class ToTimestamp } case StringType => val zid = ctx.addReferenceObj("zoneId", zoneId, classOf[ZoneId].getName) - val locale = ctx.addReferenceObj("locale", Locale.US) val tf = TimestampFormatter.getClass.getName.stripSuffix("$") + val ldf = LegacyDateFormats.getClass.getName.stripSuffix("$") nullSafeCodeGen(ctx, ev, (string, format) => { s""" try { - ${ev.value} = $tf$$.MODULE$$.apply($format.toString(), $zid, $locale) - .parse($string.toString()) / $downScaleFactor; + ${ev.value} = $tf$$.MODULE$$.apply( + $format.toString(), + $zid, + $ldf$$.MODULE$$.SIMPLE_DATE_FORMAT()) + .parse($string.toString()) / $downScaleFactor; } catch (java.lang.IllegalArgumentException e) { ${ev.isNull} = true; } catch (java.text.ParseException e) { @@ -908,7 +918,7 @@ case class FromUnixTime(sec: Expression, format: Expression, timeZoneId: Option[ override def prettyName: String = "from_unixtime" def this(unix: Expression) = { - this(unix, Literal("uuuu-MM-dd HH:mm:ss")) + this(unix, Literal(TimestampFormatter.defaultPattern)) } 
override def dataType: DataType = StringType @@ -922,7 +932,7 @@ case class FromUnixTime(sec: Expression, format: Expression, timeZoneId: Option[ private lazy val constFormat: UTF8String = right.eval().asInstanceOf[UTF8String] private lazy val formatter: TimestampFormatter = try { - TimestampFormatter(constFormat.toString, zoneId) + TimestampFormatter(constFormat.toString, zoneId, legacyFormat = SIMPLE_DATE_FORMAT) } catch { case NonFatal(_) => null } @@ -948,8 +958,9 @@ case class FromUnixTime(sec: Expression, format: Expression, timeZoneId: Option[ null } else { try { - UTF8String.fromString(TimestampFormatter(f.toString, zoneId) - .format(time.asInstanceOf[Long] * MICROS_PER_SECOND)) + UTF8String.fromString( + TimestampFormatter(f.toString, zoneId, legacyFormat = SIMPLE_DATE_FORMAT) + .format(time.asInstanceOf[Long] * MICROS_PER_SECOND)) } catch { case NonFatal(_) => null } @@ -980,13 +991,14 @@ case class FromUnixTime(sec: Expression, format: Expression, timeZoneId: Option[ } } else { val zid = ctx.addReferenceObj("zoneId", zoneId, classOf[ZoneId].getName) - val locale = ctx.addReferenceObj("locale", Locale.US) val tf = TimestampFormatter.getClass.getName.stripSuffix("$") + val ldf = LegacyDateFormats.getClass.getName.stripSuffix("$") nullSafeCodeGen(ctx, ev, (seconds, f) => { s""" try { - ${ev.value} = UTF8String.fromString($tf$$.MODULE$$.apply($f.toString(), $zid, $locale). - format($seconds * 1000000L)); + ${ev.value} = UTF8String.fromString( + $tf$$.MODULE$$.apply($f.toString(), $zid, $ldf$$.MODULE$$.SIMPLE_DATE_FORMAT()) + .format($seconds * 1000000L)); } catch (java.lang.IllegalArgumentException e) { ${ev.isNull} = true; }""" diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/json/JSONOptions.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/json/JSONOptions.scala index cdf4b4689e821..45c4edff47070 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/json/JSONOptions.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/json/JSONOptions.scala @@ -88,10 +88,10 @@ private[sql] class JSONOptions( val zoneId: ZoneId = DateTimeUtils.getZoneId( parameters.getOrElse(DateTimeUtils.TIMEZONE_OPTION, defaultTimeZoneId)) - val dateFormat: String = parameters.getOrElse("dateFormat", "uuuu-MM-dd") + val dateFormat: String = parameters.getOrElse("dateFormat", DateFormatter.defaultPattern) val timestampFormat: String = - parameters.getOrElse("timestampFormat", "uuuu-MM-dd'T'HH:mm:ss.SSSXXX") + parameters.getOrElse("timestampFormat", s"${DateFormatter.defaultPattern}'T'HH:mm:ss.SSSXXX") val multiLine = parameters.get("multiLine").map(_.toBoolean).getOrElse(false) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/json/JacksonGenerator.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/json/JacksonGenerator.scala index 9c63593ea1752..141360ff02117 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/json/JacksonGenerator.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/json/JacksonGenerator.scala @@ -24,6 +24,7 @@ import com.fasterxml.jackson.core._ import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions.SpecializedGetters import org.apache.spark.sql.catalyst.util._ +import org.apache.spark.sql.catalyst.util.LegacyDateFormats.FAST_DATE_FORMAT import org.apache.spark.sql.types._ /** @@ -80,11 +81,13 @@ private[sql] class JacksonGenerator( private val timestampFormatter = TimestampFormatter( options.timestampFormat, 
options.zoneId, - options.locale) + options.locale, + legacyFormat = FAST_DATE_FORMAT) private val dateFormatter = DateFormatter( options.dateFormat, options.zoneId, - options.locale) + options.locale, + legacyFormat = FAST_DATE_FORMAT) private def makeWriter(dataType: DataType): ValueWriter = dataType match { case NullType => diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/json/JacksonParser.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/json/JacksonParser.scala index 76efa574a99ff..1e408cdb126b8 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/json/JacksonParser.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/json/JacksonParser.scala @@ -30,6 +30,7 @@ import org.apache.spark.internal.Logging import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions._ import org.apache.spark.sql.catalyst.util._ +import org.apache.spark.sql.catalyst.util.LegacyDateFormats.FAST_DATE_FORMAT import org.apache.spark.sql.internal.SQLConf import org.apache.spark.sql.types._ import org.apache.spark.unsafe.types.{CalendarInterval, UTF8String} @@ -58,11 +59,13 @@ class JacksonParser( private val timestampFormatter = TimestampFormatter( options.timestampFormat, options.zoneId, - options.locale) + options.locale, + legacyFormat = FAST_DATE_FORMAT) private val dateFormatter = DateFormatter( options.dateFormat, options.zoneId, - options.locale) + options.locale, + legacyFormat = FAST_DATE_FORMAT) /** * Create a converter which converts the JSON documents held by the `JsonParser` diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/json/JsonInferSchema.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/json/JsonInferSchema.scala index f030955ee6e7f..82dd6d0da2632 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/json/JsonInferSchema.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/json/JsonInferSchema.scala @@ -29,6 +29,7 @@ import org.apache.spark.sql.catalyst.analysis.TypeCoercion import org.apache.spark.sql.catalyst.expressions.ExprUtils import org.apache.spark.sql.catalyst.json.JacksonUtils.nextUntil import org.apache.spark.sql.catalyst.util._ +import org.apache.spark.sql.catalyst.util.LegacyDateFormats.FAST_DATE_FORMAT import org.apache.spark.sql.internal.SQLConf import org.apache.spark.sql.types._ import org.apache.spark.util.Utils @@ -40,7 +41,8 @@ private[sql] class JsonInferSchema(options: JSONOptions) extends Serializable { private val timestampFormatter = TimestampFormatter( options.timestampFormat, options.zoneId, - options.locale) + options.locale, + legacyFormat = FAST_DATE_FORMAT) /** * Infer the type of a collection of json records in three stages: diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/DateFormatter.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/DateFormatter.scala index 28189b65dee9a..2cf82d1cfa177 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/DateFormatter.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/DateFormatter.scala @@ -17,8 +17,9 @@ package org.apache.spark.sql.catalyst.util +import java.text.SimpleDateFormat import java.time.{LocalDate, ZoneId} -import java.util.Locale +import java.util.{Date, Locale} import org.apache.commons.lang3.time.FastDateFormat @@ -51,41 +52,76 @@ class Iso8601DateFormatter( } } -class LegacyDateFormatter(pattern: String, locale: Locale) extends 
DateFormatter { - @transient - private lazy val format = FastDateFormat.getInstance(pattern, locale) +trait LegacyDateFormatter extends DateFormatter { + def parseToDate(s: String): Date + def formatDate(d: Date): String override def parse(s: String): Int = { - val milliseconds = format.parse(s).getTime + val milliseconds = parseToDate(s).getTime DateTimeUtils.millisToDays(milliseconds) } override def format(days: Int): String = { val date = DateTimeUtils.toJavaDate(days) - format.format(date) + formatDate(date) } } +class LegacyFastDateFormatter(pattern: String, locale: Locale) extends LegacyDateFormatter { + @transient + private lazy val fdf = FastDateFormat.getInstance(pattern, locale) + override def parseToDate(s: String): Date = fdf.parse(s) + override def formatDate(d: Date): String = fdf.format(d) +} + +class LegacySimpleDateFormatter(pattern: String, locale: Locale) extends LegacyDateFormatter { + @transient + private lazy val sdf = new SimpleDateFormat(pattern, locale) + override def parseToDate(s: String): Date = sdf.parse(s) + override def formatDate(d: Date): String = sdf.format(d) +} + object DateFormatter { + import LegacyDateFormats._ + val defaultLocale: Locale = Locale.US - def apply(format: String, zoneId: ZoneId, locale: Locale): DateFormatter = { + def defaultPattern(): String = { + if (SQLConf.get.legacyTimeParserEnabled) "yyyy-MM-dd" else "uuuu-MM-dd" + } + + private def getFormatter( + format: Option[String], + zoneId: ZoneId, + locale: Locale = defaultLocale, + legacyFormat: LegacyDateFormat = LENIENT_SIMPLE_DATE_FORMAT): DateFormatter = { + + val pattern = format.getOrElse(defaultPattern) if (SQLConf.get.legacyTimeParserEnabled) { - new LegacyDateFormatter(format, locale) + legacyFormat match { + case FAST_DATE_FORMAT => + new LegacyFastDateFormatter(pattern, locale) + case SIMPLE_DATE_FORMAT | LENIENT_SIMPLE_DATE_FORMAT => + new LegacySimpleDateFormatter(pattern, locale) + } } else { - new Iso8601DateFormatter(format, zoneId, locale) + new Iso8601DateFormatter(pattern, zoneId, locale) } } + def apply( + format: String, + zoneId: ZoneId, + locale: Locale, + legacyFormat: LegacyDateFormat): DateFormatter = { + getFormatter(Some(format), zoneId, locale, legacyFormat) + } + def apply(format: String, zoneId: ZoneId): DateFormatter = { - apply(format, zoneId, defaultLocale) + getFormatter(Some(format), zoneId) } def apply(zoneId: ZoneId): DateFormatter = { - if (SQLConf.get.legacyTimeParserEnabled) { - new LegacyDateFormatter("yyyy-MM-dd", defaultLocale) - } else { - new Iso8601DateFormatter("uuuu-MM-dd", zoneId, defaultLocale) - } + getFormatter(None, zoneId) } } diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/TimestampFormatter.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/TimestampFormatter.scala index fe1a4fe710c20..4893a7ec91cbb 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/TimestampFormatter.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/TimestampFormatter.scala @@ -17,19 +17,20 @@ package org.apache.spark.sql.catalyst.util -import java.text.ParseException +import java.text.{ParseException, ParsePosition, SimpleDateFormat} import java.time._ import java.time.format.DateTimeParseException import java.time.temporal.ChronoField.MICRO_OF_SECOND import java.time.temporal.TemporalQueries -import java.util.{Locale, TimeZone} +import java.util.{Calendar, GregorianCalendar, Locale, TimeZone} import java.util.concurrent.TimeUnit.SECONDS import 
org.apache.commons.lang3.time.FastDateFormat -import org.apache.spark.sql.catalyst.util.DateTimeConstants.MICROS_PER_MILLIS -import org.apache.spark.sql.catalyst.util.DateTimeUtils.convertSpecialTimestamp +import org.apache.spark.sql.catalyst.util.DateTimeConstants._ +import org.apache.spark.sql.catalyst.util.DateTimeUtils.{ convertSpecialTimestamp, SQLTimestamp} import org.apache.spark.sql.internal.SQLConf +import org.apache.spark.sql.types.Decimal sealed trait TimestampFormatter extends Serializable { /** @@ -90,44 +91,139 @@ class FractionTimestampFormatter(zoneId: ZoneId) override protected lazy val formatter = DateTimeFormatterHelper.fractionFormatter } -class LegacyTimestampFormatter( +/** + * The custom sub-class of `GregorianCalendar` is needed to get access to + * protected `fields` immediately after parsing. We cannot use + * the `get()` method because it performs normalization of the fraction + * part. Accordingly, the `MILLISECOND` field doesn't contain original value. + * + * Also this class allows to set raw value to the `MILLISECOND` field + * directly before formatting. + */ +class MicrosCalendar(tz: TimeZone, digitsInFraction: Int) + extends GregorianCalendar(tz, Locale.US) { + // Converts parsed `MILLISECOND` field to seconds fraction in microsecond precision. + // For example if the fraction pattern is `SSSS` then `digitsInFraction` = 4, and + // if the `MILLISECOND` field was parsed to `1234`. + def getMicros(): SQLTimestamp = { + // Append 6 zeros to the field: 1234 -> 1234000000 + val d = fields(Calendar.MILLISECOND) * MICROS_PER_SECOND + // Take the first 6 digits from `d`: 1234000000 -> 123400 + // The rest contains exactly `digitsInFraction`: `0000` = 10 ^ digitsInFraction + // So, the result is `(1234 * 1000000) / (10 ^ digitsInFraction) + d / Decimal.POW_10(digitsInFraction) + } + + // Converts the seconds fraction in microsecond precision to a value + // that can be correctly formatted according to the specified fraction pattern. + // The method performs operations opposite to `getMicros()`. 
+ def setMicros(micros: Long): Unit = { + val d = micros * Decimal.POW_10(digitsInFraction) + fields(Calendar.MILLISECOND) = (d / MICROS_PER_SECOND).toInt + } +} + +class LegacyFastTimestampFormatter( pattern: String, zoneId: ZoneId, locale: Locale) extends TimestampFormatter { - @transient private lazy val format = + @transient private lazy val fastDateFormat = FastDateFormat.getInstance(pattern, TimeZone.getTimeZone(zoneId), locale) + @transient private lazy val cal = new MicrosCalendar( + fastDateFormat.getTimeZone, + fastDateFormat.getPattern.count(_ == 'S')) + + def parse(s: String): SQLTimestamp = { + cal.clear() // Clear the calendar because it can be re-used many times + if (!fastDateFormat.parse(s, new ParsePosition(0), cal)) { + throw new IllegalArgumentException(s"'$s' is an invalid timestamp") + } + val micros = cal.getMicros() + cal.set(Calendar.MILLISECOND, 0) + cal.getTimeInMillis * MICROS_PER_MILLIS + micros + } + + def format(timestamp: SQLTimestamp): String = { + cal.setTimeInMillis(Math.floorDiv(timestamp, MICROS_PER_SECOND) * MILLIS_PER_SECOND) + cal.setMicros(Math.floorMod(timestamp, MICROS_PER_SECOND)) + fastDateFormat.format(cal) + } +} - protected def toMillis(s: String): Long = format.parse(s).getTime +class LegacySimpleTimestampFormatter( + pattern: String, + zoneId: ZoneId, + locale: Locale, + lenient: Boolean = true) extends TimestampFormatter { + @transient private lazy val sdf = { + val formatter = new SimpleDateFormat(pattern, locale) + formatter.setTimeZone(TimeZone.getTimeZone(zoneId)) + formatter.setLenient(lenient) + formatter + } - override def parse(s: String): Long = toMillis(s) * MICROS_PER_MILLIS + override def parse(s: String): Long = { + sdf.parse(s).getTime * MICROS_PER_MILLIS + } override def format(us: Long): String = { - format.format(DateTimeUtils.toJavaTimestamp(us)) + val timestamp = DateTimeUtils.toJavaTimestamp(us) + sdf.format(timestamp) } } +object LegacyDateFormats extends Enumeration { + type LegacyDateFormat = Value + val FAST_DATE_FORMAT, SIMPLE_DATE_FORMAT, LENIENT_SIMPLE_DATE_FORMAT = Value +} + object TimestampFormatter { + import LegacyDateFormats._ + val defaultLocale: Locale = Locale.US - def apply(format: String, zoneId: ZoneId, locale: Locale): TimestampFormatter = { + def defaultPattern(): String = s"${DateFormatter.defaultPattern()} HH:mm:ss" + + private def getFormatter( + format: Option[String], + zoneId: ZoneId, + locale: Locale = defaultLocale, + legacyFormat: LegacyDateFormat = LENIENT_SIMPLE_DATE_FORMAT): TimestampFormatter = { + + val pattern = format.getOrElse(defaultPattern) if (SQLConf.get.legacyTimeParserEnabled) { - new LegacyTimestampFormatter(format, zoneId, locale) + legacyFormat match { + case FAST_DATE_FORMAT => + new LegacyFastTimestampFormatter(pattern, zoneId, locale) + case SIMPLE_DATE_FORMAT => + new LegacySimpleTimestampFormatter(pattern, zoneId, locale, lenient = false) + case LENIENT_SIMPLE_DATE_FORMAT => + new LegacySimpleTimestampFormatter(pattern, zoneId, locale, lenient = true) + } } else { - new Iso8601TimestampFormatter(format, zoneId, locale) + new Iso8601TimestampFormatter(pattern, zoneId, locale) } } + def apply( + format: String, + zoneId: ZoneId, + locale: Locale, + legacyFormat: LegacyDateFormat): TimestampFormatter = { + getFormatter(Some(format), zoneId, locale, legacyFormat) + } + + def apply(format: String, zoneId: ZoneId, legacyFormat: LegacyDateFormat): TimestampFormatter = { + getFormatter(Some(format), zoneId, defaultLocale, legacyFormat) + } + def apply(format: String, zoneId: 
ZoneId): TimestampFormatter = { - apply(format, zoneId, defaultLocale) + getFormatter(Some(format), zoneId) } def apply(zoneId: ZoneId): TimestampFormatter = { - if (SQLConf.get.legacyTimeParserEnabled) { - new LegacyTimestampFormatter("yyyy-MM-dd HH:mm:ss", zoneId, defaultLocale) - } else { - new Iso8601TimestampFormatter("uuuu-MM-dd HH:mm:ss", zoneId, defaultLocale) - } + getFormatter(None, zoneId) } def getFractionFormatter(zoneId: ZoneId): TimestampFormatter = { diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/types/Decimal.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/types/Decimal.scala index 9ce64b09f7870..f32e48e1cc128 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/types/Decimal.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/types/Decimal.scala @@ -541,7 +541,7 @@ object Decimal { /** Maximum number of decimal digits a Long can represent */ val MAX_LONG_DIGITS = 18 - private val POW_10 = Array.tabulate[Long](MAX_LONG_DIGITS + 1)(i => math.pow(10, i).toLong) + val POW_10 = Array.tabulate[Long](MAX_LONG_DIGITS + 1)(i => math.pow(10, i).toLong) private val BIG_DEC_ZERO = BigDecimal(0) diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/DateExpressionsSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/DateExpressionsSuite.scala index 274d0beebd300..f04149ab7eb29 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/DateExpressionsSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/DateExpressionsSuite.scala @@ -31,6 +31,7 @@ import org.apache.spark.sql.catalyst.util.{DateTimeUtils, IntervalUtils, Timesta import org.apache.spark.sql.catalyst.util.DateTimeConstants.NANOS_PER_SECOND import org.apache.spark.sql.catalyst.util.DateTimeTestUtils._ import org.apache.spark.sql.catalyst.util.DateTimeUtils.TimeZoneGMT +import org.apache.spark.sql.internal.SQLConf import org.apache.spark.sql.types._ import org.apache.spark.unsafe.types.{CalendarInterval, UTF8String} @@ -241,41 +242,45 @@ class DateExpressionsSuite extends SparkFunSuite with ExpressionEvalHelper { } test("DateFormat") { - checkEvaluation( - DateFormatClass(Literal.create(null, TimestampType), Literal("y"), gmtId), - null) - checkEvaluation(DateFormatClass(Cast(Literal(d), TimestampType, gmtId), - Literal.create(null, StringType), gmtId), null) - - checkEvaluation(DateFormatClass(Cast(Literal(d), TimestampType, gmtId), - Literal("y"), gmtId), "2015") - checkEvaluation(DateFormatClass(Literal(ts), Literal("y"), gmtId), "2013") - checkEvaluation(DateFormatClass(Cast(Literal(d), TimestampType, gmtId), - Literal("H"), gmtId), "0") - checkEvaluation(DateFormatClass(Literal(ts), Literal("H"), gmtId), "13") - - checkEvaluation(DateFormatClass(Cast(Literal(d), TimestampType, pstId), - Literal("y"), pstId), "2015") - checkEvaluation(DateFormatClass(Literal(ts), Literal("y"), pstId), "2013") - checkEvaluation(DateFormatClass(Cast(Literal(d), TimestampType, pstId), - Literal("H"), pstId), "0") - checkEvaluation(DateFormatClass(Literal(ts), Literal("H"), pstId), "5") - - checkEvaluation(DateFormatClass(Cast(Literal(d), TimestampType, jstId), - Literal("y"), jstId), "2015") - checkEvaluation(DateFormatClass(Literal(ts), Literal("y"), jstId), "2013") - checkEvaluation(DateFormatClass(Cast(Literal(d), TimestampType, jstId), - Literal("H"), jstId), "0") - checkEvaluation(DateFormatClass(Literal(ts), Literal("H"), jstId), "22") - - // SPARK-28072 The codegen path should 
work - checkEvaluation( - expression = DateFormatClass( - BoundReference(ordinal = 0, dataType = TimestampType, nullable = true), - BoundReference(ordinal = 1, dataType = StringType, nullable = true), - jstId), - expected = "22", - inputRow = InternalRow(DateTimeUtils.fromJavaTimestamp(ts), UTF8String.fromString("H"))) + Seq(false, true).foreach { legacyParser => + withSQLConf(SQLConf.LEGACY_TIME_PARSER_ENABLED.key -> legacyParser.toString) { + checkEvaluation( + DateFormatClass(Literal.create(null, TimestampType), Literal("y"), gmtId), + null) + checkEvaluation(DateFormatClass(Cast(Literal(d), TimestampType, gmtId), + Literal.create(null, StringType), gmtId), null) + + checkEvaluation(DateFormatClass(Cast(Literal(d), TimestampType, gmtId), + Literal("y"), gmtId), "2015") + checkEvaluation(DateFormatClass(Literal(ts), Literal("y"), gmtId), "2013") + checkEvaluation(DateFormatClass(Cast(Literal(d), TimestampType, gmtId), + Literal("H"), gmtId), "0") + checkEvaluation(DateFormatClass(Literal(ts), Literal("H"), gmtId), "13") + + checkEvaluation(DateFormatClass(Cast(Literal(d), TimestampType, pstId), + Literal("y"), pstId), "2015") + checkEvaluation(DateFormatClass(Literal(ts), Literal("y"), pstId), "2013") + checkEvaluation(DateFormatClass(Cast(Literal(d), TimestampType, pstId), + Literal("H"), pstId), "0") + checkEvaluation(DateFormatClass(Literal(ts), Literal("H"), pstId), "5") + + checkEvaluation(DateFormatClass(Cast(Literal(d), TimestampType, jstId), + Literal("y"), jstId), "2015") + checkEvaluation(DateFormatClass(Literal(ts), Literal("y"), jstId), "2013") + checkEvaluation(DateFormatClass(Cast(Literal(d), TimestampType, jstId), + Literal("H"), jstId), "0") + checkEvaluation(DateFormatClass(Literal(ts), Literal("H"), jstId), "22") + + // SPARK-28072 The codegen path should work + checkEvaluation( + expression = DateFormatClass( + BoundReference(ordinal = 0, dataType = TimestampType, nullable = true), + BoundReference(ordinal = 1, dataType = StringType, nullable = true), + jstId), + expected = "22", + inputRow = InternalRow(DateTimeUtils.fromJavaTimestamp(ts), UTF8String.fromString("H"))) + } + } } test("Hour") { @@ -705,162 +710,189 @@ class DateExpressionsSuite extends SparkFunSuite with ExpressionEvalHelper { } test("from_unixtime") { - val fmt1 = "yyyy-MM-dd HH:mm:ss" - val sdf1 = new SimpleDateFormat(fmt1, Locale.US) - val fmt2 = "yyyy-MM-dd HH:mm:ss.SSS" - val sdf2 = new SimpleDateFormat(fmt2, Locale.US) - for (tz <- Seq(TimeZoneGMT, TimeZonePST, TimeZoneJST)) { - val timeZoneId = Option(tz.getID) - sdf1.setTimeZone(tz) - sdf2.setTimeZone(tz) + Seq(false, true).foreach { legacyParser => + withSQLConf(SQLConf.LEGACY_TIME_PARSER_ENABLED.key -> legacyParser.toString) { + val fmt1 = "yyyy-MM-dd HH:mm:ss" + val sdf1 = new SimpleDateFormat(fmt1, Locale.US) + val fmt2 = "yyyy-MM-dd HH:mm:ss.SSS" + val sdf2 = new SimpleDateFormat(fmt2, Locale.US) + for (tz <- Seq(TimeZoneGMT, TimeZonePST, TimeZoneJST)) { + val timeZoneId = Option(tz.getID) + sdf1.setTimeZone(tz) + sdf2.setTimeZone(tz) - checkEvaluation( - FromUnixTime(Literal(0L), Literal(fmt1), timeZoneId), - sdf1.format(new Timestamp(0))) - checkEvaluation(FromUnixTime( - Literal(1000L), Literal(fmt1), timeZoneId), - sdf1.format(new Timestamp(1000000))) - checkEvaluation( - FromUnixTime(Literal(-1000L), Literal(fmt2), timeZoneId), - sdf2.format(new Timestamp(-1000000))) - checkEvaluation( - FromUnixTime(Literal.create(null, LongType), Literal.create(null, StringType), timeZoneId), - null) - checkEvaluation( - 
FromUnixTime(Literal.create(null, LongType), Literal(fmt1), timeZoneId), - null) - checkEvaluation( - FromUnixTime(Literal(1000L), Literal.create(null, StringType), timeZoneId), - null) - checkEvaluation( - FromUnixTime(Literal(0L), Literal("not a valid format"), timeZoneId), null) + checkEvaluation( + FromUnixTime(Literal(0L), Literal(fmt1), timeZoneId), + sdf1.format(new Timestamp(0))) + checkEvaluation(FromUnixTime( + Literal(1000L), Literal(fmt1), timeZoneId), + sdf1.format(new Timestamp(1000000))) + checkEvaluation( + FromUnixTime(Literal(-1000L), Literal(fmt2), timeZoneId), + sdf2.format(new Timestamp(-1000000))) + checkEvaluation( + FromUnixTime( + Literal.create(null, LongType), + Literal.create(null, StringType), timeZoneId), + null) + checkEvaluation( + FromUnixTime(Literal.create(null, LongType), Literal(fmt1), timeZoneId), + null) + checkEvaluation( + FromUnixTime(Literal(1000L), Literal.create(null, StringType), timeZoneId), + null) + checkEvaluation( + FromUnixTime(Literal(0L), Literal("not a valid format"), timeZoneId), null) - // SPARK-28072 The codegen path for non-literal input should also work - checkEvaluation( - expression = FromUnixTime( - BoundReference(ordinal = 0, dataType = LongType, nullable = true), - BoundReference(ordinal = 1, dataType = StringType, nullable = true), - timeZoneId), - expected = UTF8String.fromString(sdf1.format(new Timestamp(0))), - inputRow = InternalRow(0L, UTF8String.fromString(fmt1))) + // SPARK-28072 The codegen path for non-literal input should also work + checkEvaluation( + expression = FromUnixTime( + BoundReference(ordinal = 0, dataType = LongType, nullable = true), + BoundReference(ordinal = 1, dataType = StringType, nullable = true), + timeZoneId), + expected = UTF8String.fromString(sdf1.format(new Timestamp(0))), + inputRow = InternalRow(0L, UTF8String.fromString(fmt1))) + } + } } } test("unix_timestamp") { - val sdf1 = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss", Locale.US) - val fmt2 = "yyyy-MM-dd HH:mm:ss.SSS" - val sdf2 = new SimpleDateFormat(fmt2, Locale.US) - val fmt3 = "yy-MM-dd" - val sdf3 = new SimpleDateFormat(fmt3, Locale.US) - sdf3.setTimeZone(TimeZoneGMT) - - withDefaultTimeZone(TimeZoneGMT) { - for (tz <- Seq(TimeZoneGMT, TimeZonePST, TimeZoneJST)) { - val timeZoneId = Option(tz.getID) - sdf1.setTimeZone(tz) - sdf2.setTimeZone(tz) - - val date1 = Date.valueOf("2015-07-24") - checkEvaluation(UnixTimestamp( - Literal(sdf1.format(new Timestamp(0))), Literal("yyyy-MM-dd HH:mm:ss"), timeZoneId), 0L) - checkEvaluation(UnixTimestamp( - Literal(sdf1.format(new Timestamp(1000000))), Literal("yyyy-MM-dd HH:mm:ss"), timeZoneId), - 1000L) - checkEvaluation( - UnixTimestamp( - Literal(new Timestamp(1000000)), Literal("yyyy-MM-dd HH:mm:ss"), timeZoneId), - 1000L) - checkEvaluation( - UnixTimestamp(Literal(date1), Literal("yyyy-MM-dd HH:mm:ss"), timeZoneId), - MILLISECONDS.toSeconds(DateTimeUtils.daysToMillis(DateTimeUtils.fromJavaDate(date1), tz))) - checkEvaluation( - UnixTimestamp(Literal(sdf2.format(new Timestamp(-1000000))), Literal(fmt2), timeZoneId), - -1000L) - checkEvaluation(UnixTimestamp( - Literal(sdf3.format(Date.valueOf("2015-07-24"))), Literal(fmt3), timeZoneId), - MILLISECONDS.toSeconds(DateTimeUtils.daysToMillis( - DateTimeUtils.fromJavaDate(Date.valueOf("2015-07-24")), tz))) - val t1 = UnixTimestamp( - CurrentTimestamp(), Literal("yyyy-MM-dd HH:mm:ss")).eval().asInstanceOf[Long] - val t2 = UnixTimestamp( - CurrentTimestamp(), Literal("yyyy-MM-dd HH:mm:ss")).eval().asInstanceOf[Long] - assert(t2 - t1 <= 1) - 
checkEvaluation( - UnixTimestamp( - Literal.create(null, DateType), Literal.create(null, StringType), timeZoneId), - null) - checkEvaluation( - UnixTimestamp(Literal.create(null, DateType), Literal("yyyy-MM-dd HH:mm:ss"), timeZoneId), - null) - checkEvaluation( - UnixTimestamp(Literal(date1), Literal.create(null, StringType), timeZoneId), - MILLISECONDS.toSeconds(DateTimeUtils.daysToMillis(DateTimeUtils.fromJavaDate(date1), tz))) - checkEvaluation( - UnixTimestamp(Literal("2015-07-24"), Literal("not a valid format"), timeZoneId), null) + Seq(false, true).foreach { legacyParser => + withSQLConf(SQLConf.LEGACY_TIME_PARSER_ENABLED.key -> legacyParser.toString) { + val sdf1 = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss", Locale.US) + val fmt2 = "yyyy-MM-dd HH:mm:ss.SSS" + val sdf2 = new SimpleDateFormat(fmt2, Locale.US) + val fmt3 = "yy-MM-dd" + val sdf3 = new SimpleDateFormat(fmt3, Locale.US) + sdf3.setTimeZone(TimeZoneGMT) + + withDefaultTimeZone(TimeZoneGMT) { + for (tz <- Seq(TimeZoneGMT, TimeZonePST, TimeZoneJST)) { + val timeZoneId = Option(tz.getID) + sdf1.setTimeZone(tz) + sdf2.setTimeZone(tz) + + val date1 = Date.valueOf("2015-07-24") + checkEvaluation(UnixTimestamp( + Literal(sdf1.format(new Timestamp(0))), + Literal("yyyy-MM-dd HH:mm:ss"), timeZoneId), 0L) + checkEvaluation(UnixTimestamp( + Literal(sdf1.format(new Timestamp(1000000))), + Literal("yyyy-MM-dd HH:mm:ss"), timeZoneId), + 1000L) + checkEvaluation( + UnixTimestamp( + Literal(new Timestamp(1000000)), Literal("yyyy-MM-dd HH:mm:ss"), timeZoneId), + 1000L) + checkEvaluation( + UnixTimestamp(Literal(date1), Literal("yyyy-MM-dd HH:mm:ss"), timeZoneId), + MILLISECONDS.toSeconds( + DateTimeUtils.daysToMillis(DateTimeUtils.fromJavaDate(date1), tz))) + checkEvaluation( + UnixTimestamp(Literal(sdf2.format(new Timestamp(-1000000))), + Literal(fmt2), timeZoneId), + -1000L) + checkEvaluation(UnixTimestamp( + Literal(sdf3.format(Date.valueOf("2015-07-24"))), Literal(fmt3), timeZoneId), + MILLISECONDS.toSeconds(DateTimeUtils.daysToMillis( + DateTimeUtils.fromJavaDate(Date.valueOf("2015-07-24")), tz))) + val t1 = UnixTimestamp( + CurrentTimestamp(), Literal("yyyy-MM-dd HH:mm:ss")).eval().asInstanceOf[Long] + val t2 = UnixTimestamp( + CurrentTimestamp(), Literal("yyyy-MM-dd HH:mm:ss")).eval().asInstanceOf[Long] + assert(t2 - t1 <= 1) + checkEvaluation( + UnixTimestamp( + Literal.create(null, DateType), Literal.create(null, StringType), timeZoneId), + null) + checkEvaluation( + UnixTimestamp( + Literal.create(null, DateType), + Literal("yyyy-MM-dd HH:mm:ss"), timeZoneId), + null) + checkEvaluation( + UnixTimestamp(Literal(date1), Literal.create(null, StringType), timeZoneId), + MILLISECONDS.toSeconds( + DateTimeUtils.daysToMillis(DateTimeUtils.fromJavaDate(date1), tz))) + checkEvaluation( + UnixTimestamp(Literal("2015-07-24"), Literal("not a valid format"), timeZoneId), null) + } + } } } } test("to_unix_timestamp") { - val fmt1 = "yyyy-MM-dd HH:mm:ss" - val sdf1 = new SimpleDateFormat(fmt1, Locale.US) - val fmt2 = "yyyy-MM-dd HH:mm:ss.SSS" - val sdf2 = new SimpleDateFormat(fmt2, Locale.US) - val fmt3 = "yy-MM-dd" - val sdf3 = new SimpleDateFormat(fmt3, Locale.US) - sdf3.setTimeZone(TimeZoneGMT) - - withDefaultTimeZone(TimeZoneGMT) { - for (tz <- Seq(TimeZoneGMT, TimeZonePST, TimeZoneJST)) { - val timeZoneId = Option(tz.getID) - sdf1.setTimeZone(tz) - sdf2.setTimeZone(tz) - - val date1 = Date.valueOf("2015-07-24") - checkEvaluation(ToUnixTimestamp( - Literal(sdf1.format(new Timestamp(0))), Literal(fmt1), timeZoneId), 0L) - 
checkEvaluation(ToUnixTimestamp( - Literal(sdf1.format(new Timestamp(1000000))), Literal(fmt1), timeZoneId), - 1000L) - checkEvaluation(ToUnixTimestamp( - Literal(new Timestamp(1000000)), Literal(fmt1)), - 1000L) - checkEvaluation( - ToUnixTimestamp(Literal(date1), Literal(fmt1), timeZoneId), - MILLISECONDS.toSeconds(DateTimeUtils.daysToMillis(DateTimeUtils.fromJavaDate(date1), tz))) - checkEvaluation( - ToUnixTimestamp(Literal(sdf2.format(new Timestamp(-1000000))), Literal(fmt2), timeZoneId), - -1000L) - checkEvaluation(ToUnixTimestamp( - Literal(sdf3.format(Date.valueOf("2015-07-24"))), Literal(fmt3), timeZoneId), - MILLISECONDS.toSeconds(DateTimeUtils.daysToMillis( - DateTimeUtils.fromJavaDate(Date.valueOf("2015-07-24")), tz))) - val t1 = ToUnixTimestamp( - CurrentTimestamp(), Literal(fmt1)).eval().asInstanceOf[Long] - val t2 = ToUnixTimestamp( - CurrentTimestamp(), Literal(fmt1)).eval().asInstanceOf[Long] - assert(t2 - t1 <= 1) - checkEvaluation(ToUnixTimestamp( - Literal.create(null, DateType), Literal.create(null, StringType), timeZoneId), null) - checkEvaluation( - ToUnixTimestamp( - Literal.create(null, DateType), Literal(fmt1), timeZoneId), - null) - checkEvaluation(ToUnixTimestamp( - Literal(date1), Literal.create(null, StringType), timeZoneId), - MILLISECONDS.toSeconds(DateTimeUtils.daysToMillis(DateTimeUtils.fromJavaDate(date1), tz))) - checkEvaluation( - ToUnixTimestamp(Literal("2015-07-24"), Literal("not a valid format"), timeZoneId), null) + Seq(false, true).foreach { legacyParser => + withSQLConf(SQLConf.LEGACY_TIME_PARSER_ENABLED.key -> legacyParser.toString) { + val fmt1 = "yyyy-MM-dd HH:mm:ss" + val sdf1 = new SimpleDateFormat(fmt1, Locale.US) + val fmt2 = "yyyy-MM-dd HH:mm:ss.SSS" + val sdf2 = new SimpleDateFormat(fmt2, Locale.US) + val fmt3 = "yy-MM-dd" + val sdf3 = new SimpleDateFormat(fmt3, Locale.US) + sdf3.setTimeZone(TimeZoneGMT) + + withDefaultTimeZone(TimeZoneGMT) { + for (tz <- Seq(TimeZoneGMT, TimeZonePST, TimeZoneJST)) { + val timeZoneId = Option(tz.getID) + sdf1.setTimeZone(tz) + sdf2.setTimeZone(tz) + + val date1 = Date.valueOf("2015-07-24") + checkEvaluation(ToUnixTimestamp( + Literal(sdf1.format(new Timestamp(0))), Literal(fmt1), timeZoneId), 0L) + checkEvaluation(ToUnixTimestamp( + Literal(sdf1.format(new Timestamp(1000000))), Literal(fmt1), timeZoneId), + 1000L) + checkEvaluation(ToUnixTimestamp( + Literal(new Timestamp(1000000)), Literal(fmt1)), + 1000L) + checkEvaluation( + ToUnixTimestamp(Literal(date1), Literal(fmt1), timeZoneId), + MILLISECONDS.toSeconds( + DateTimeUtils.daysToMillis(DateTimeUtils.fromJavaDate(date1), tz))) + checkEvaluation( + ToUnixTimestamp( + Literal(sdf2.format(new Timestamp(-1000000))), + Literal(fmt2), timeZoneId), + -1000L) + checkEvaluation(ToUnixTimestamp( + Literal(sdf3.format(Date.valueOf("2015-07-24"))), Literal(fmt3), timeZoneId), + MILLISECONDS.toSeconds(DateTimeUtils.daysToMillis( + DateTimeUtils.fromJavaDate(Date.valueOf("2015-07-24")), tz))) + val t1 = ToUnixTimestamp( + CurrentTimestamp(), Literal(fmt1)).eval().asInstanceOf[Long] + val t2 = ToUnixTimestamp( + CurrentTimestamp(), Literal(fmt1)).eval().asInstanceOf[Long] + assert(t2 - t1 <= 1) + checkEvaluation(ToUnixTimestamp( + Literal.create(null, DateType), Literal.create(null, StringType), timeZoneId), null) + checkEvaluation( + ToUnixTimestamp( + Literal.create(null, DateType), Literal(fmt1), timeZoneId), + null) + checkEvaluation(ToUnixTimestamp( + Literal(date1), Literal.create(null, StringType), timeZoneId), + MILLISECONDS.toSeconds( + 
DateTimeUtils.daysToMillis(DateTimeUtils.fromJavaDate(date1), tz))) + checkEvaluation( + ToUnixTimestamp( + Literal("2015-07-24"), + Literal("not a valid format"), timeZoneId), null) - // SPARK-28072 The codegen path for non-literal input should also work - checkEvaluation( - expression = ToUnixTimestamp( - BoundReference(ordinal = 0, dataType = StringType, nullable = true), - BoundReference(ordinal = 1, dataType = StringType, nullable = true), - timeZoneId), - expected = 0L, - inputRow = InternalRow( - UTF8String.fromString(sdf1.format(new Timestamp(0))), UTF8String.fromString(fmt1))) + // SPARK-28072 The codegen path for non-literal input should also work + checkEvaluation( + expression = ToUnixTimestamp( + BoundReference(ordinal = 0, dataType = StringType, nullable = true), + BoundReference(ordinal = 1, dataType = StringType, nullable = true), + timeZoneId), + expected = 0L, + inputRow = InternalRow( + UTF8String.fromString(sdf1.format(new Timestamp(0))), UTF8String.fromString(fmt1))) + } + } } } } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/functions.scala b/sql/core/src/main/scala/org/apache/spark/sql/functions.scala index d125581857e0b..2d5504ac00ffa 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/functions.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/functions.scala @@ -31,6 +31,7 @@ import org.apache.spark.sql.catalyst.encoders.ExpressionEncoder import org.apache.spark.sql.catalyst.expressions._ import org.apache.spark.sql.catalyst.expressions.aggregate._ import org.apache.spark.sql.catalyst.plans.logical.{BROADCAST, HintInfo, ResolvedHint} +import org.apache.spark.sql.catalyst.util.TimestampFormatter import org.apache.spark.sql.execution.SparkSqlParser import org.apache.spark.sql.expressions.{Aggregator, SparkUserDefinedFunction, UserDefinedAggregator, UserDefinedFunction} import org.apache.spark.sql.internal.SQLConf @@ -2881,7 +2882,7 @@ object functions { * @since 1.5.0 */ def from_unixtime(ut: Column): Column = withExpr { - FromUnixTime(ut.expr, Literal("uuuu-MM-dd HH:mm:ss")) + FromUnixTime(ut.expr, Literal(TimestampFormatter.defaultPattern)) } /** @@ -2913,7 +2914,7 @@ object functions { * @since 1.5.0 */ def unix_timestamp(): Column = withExpr { - UnixTimestamp(CurrentTimestamp(), Literal("uuuu-MM-dd HH:mm:ss")) + UnixTimestamp(CurrentTimestamp(), Literal(TimestampFormatter.defaultPattern)) } /** @@ -2927,7 +2928,7 @@ object functions { * @since 1.5.0 */ def unix_timestamp(s: Column): Column = withExpr { - UnixTimestamp(s.expr, Literal("uuuu-MM-dd HH:mm:ss")) + UnixTimestamp(s.expr, Literal(TimestampFormatter.defaultPattern)) } /** diff --git a/sql/core/src/test/resources/test-data/bad_after_good.csv b/sql/core/src/test/resources/test-data/bad_after_good.csv index 4621a7d23714d..1a7c2651a11a7 100644 --- a/sql/core/src/test/resources/test-data/bad_after_good.csv +++ b/sql/core/src/test/resources/test-data/bad_after_good.csv @@ -1,2 +1,2 @@ "good record",1999-08-01 -"bad record",1999-088-01 +"bad record",1999-088_01 diff --git a/sql/core/src/test/resources/test-data/value-malformed.csv b/sql/core/src/test/resources/test-data/value-malformed.csv index 8945ed73d2e83..6e6f08fca6df8 100644 --- a/sql/core/src/test/resources/test-data/value-malformed.csv +++ b/sql/core/src/test/resources/test-data/value-malformed.csv @@ -1,2 +1,2 @@ -0,2013-111-11 12:13:14 +0,2013-111_11 12:13:14 1,1983-08-04 diff --git a/sql/core/src/test/scala/org/apache/spark/sql/DateFunctionsSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/DateFunctionsSuite.scala 
index bb8cdf3cb6de1..41d53c959ef99 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/DateFunctionsSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/DateFunctionsSuite.scala @@ -96,15 +96,19 @@ class DateFunctionsSuite extends QueryTest with SharedSparkSession { } test("date format") { - val df = Seq((d, sdf.format(d), ts)).toDF("a", "b", "c") + Seq(false, true).foreach { legacyParser => + withSQLConf(SQLConf.LEGACY_TIME_PARSER_ENABLED.key -> legacyParser.toString) { + val df = Seq((d, sdf.format(d), ts)).toDF("a", "b", "c") - checkAnswer( - df.select(date_format($"a", "y"), date_format($"b", "y"), date_format($"c", "y")), - Row("2015", "2015", "2013")) + checkAnswer( + df.select(date_format($"a", "y"), date_format($"b", "y"), date_format($"c", "y")), + Row("2015", "2015", "2013")) - checkAnswer( - df.selectExpr("date_format(a, 'y')", "date_format(b, 'y')", "date_format(c, 'y')"), - Row("2015", "2015", "2013")) + checkAnswer( + df.selectExpr("date_format(a, 'y')", "date_format(b, 'y')", "date_format(c, 'y')"), + Row("2015", "2015", "2013")) + } + } } test("year") { @@ -525,170 +529,194 @@ class DateFunctionsSuite extends QueryTest with SharedSparkSession { } test("from_unixtime") { - val sdf1 = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss", Locale.US) - val fmt2 = "yyyy-MM-dd HH:mm:ss.SSS" - val sdf2 = new SimpleDateFormat(fmt2, Locale.US) - val fmt3 = "yy-MM-dd HH-mm-ss" - val sdf3 = new SimpleDateFormat(fmt3, Locale.US) - val df = Seq((1000, "yyyy-MM-dd HH:mm:ss.SSS"), (-1000, "yy-MM-dd HH-mm-ss")).toDF("a", "b") - checkAnswer( - df.select(from_unixtime(col("a"))), - Seq(Row(sdf1.format(new Timestamp(1000000))), Row(sdf1.format(new Timestamp(-1000000))))) - checkAnswer( - df.select(from_unixtime(col("a"), fmt2)), - Seq(Row(sdf2.format(new Timestamp(1000000))), Row(sdf2.format(new Timestamp(-1000000))))) - checkAnswer( - df.select(from_unixtime(col("a"), fmt3)), - Seq(Row(sdf3.format(new Timestamp(1000000))), Row(sdf3.format(new Timestamp(-1000000))))) - checkAnswer( - df.selectExpr("from_unixtime(a)"), - Seq(Row(sdf1.format(new Timestamp(1000000))), Row(sdf1.format(new Timestamp(-1000000))))) - checkAnswer( - df.selectExpr(s"from_unixtime(a, '$fmt2')"), - Seq(Row(sdf2.format(new Timestamp(1000000))), Row(sdf2.format(new Timestamp(-1000000))))) - checkAnswer( - df.selectExpr(s"from_unixtime(a, '$fmt3')"), - Seq(Row(sdf3.format(new Timestamp(1000000))), Row(sdf3.format(new Timestamp(-1000000))))) + Seq(false, true).foreach { legacyParser => + withSQLConf(SQLConf.LEGACY_TIME_PARSER_ENABLED.key -> legacyParser.toString) { + val sdf1 = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss", Locale.US) + val fmt2 = "yyyy-MM-dd HH:mm:ss.SSS" + val sdf2 = new SimpleDateFormat(fmt2, Locale.US) + val fmt3 = "yy-MM-dd HH-mm-ss" + val sdf3 = new SimpleDateFormat(fmt3, Locale.US) + val df = Seq((1000, "yyyy-MM-dd HH:mm:ss.SSS"), (-1000, "yy-MM-dd HH-mm-ss")).toDF("a", "b") + checkAnswer( + df.select(from_unixtime(col("a"))), + Seq(Row(sdf1.format(new Timestamp(1000000))), Row(sdf1.format(new Timestamp(-1000000))))) + checkAnswer( + df.select(from_unixtime(col("a"), fmt2)), + Seq(Row(sdf2.format(new Timestamp(1000000))), Row(sdf2.format(new Timestamp(-1000000))))) + checkAnswer( + df.select(from_unixtime(col("a"), fmt3)), + Seq(Row(sdf3.format(new Timestamp(1000000))), Row(sdf3.format(new Timestamp(-1000000))))) + checkAnswer( + df.selectExpr("from_unixtime(a)"), + Seq(Row(sdf1.format(new Timestamp(1000000))), Row(sdf1.format(new Timestamp(-1000000))))) + checkAnswer( + 
df.selectExpr(s"from_unixtime(a, '$fmt2')"), + Seq(Row(sdf2.format(new Timestamp(1000000))), Row(sdf2.format(new Timestamp(-1000000))))) + checkAnswer( + df.selectExpr(s"from_unixtime(a, '$fmt3')"), + Seq(Row(sdf3.format(new Timestamp(1000000))), Row(sdf3.format(new Timestamp(-1000000))))) + } + } } private def secs(millis: Long): Long = TimeUnit.MILLISECONDS.toSeconds(millis) test("unix_timestamp") { - val date1 = Date.valueOf("2015-07-24") - val date2 = Date.valueOf("2015-07-25") - val ts1 = Timestamp.valueOf("2015-07-24 10:00:00.3") - val ts2 = Timestamp.valueOf("2015-07-25 02:02:02.2") - val s1 = "2015/07/24 10:00:00.5" - val s2 = "2015/07/25 02:02:02.6" - val ss1 = "2015-07-24 10:00:00" - val ss2 = "2015-07-25 02:02:02" - val fmt = "yyyy/MM/dd HH:mm:ss.S" - val df = Seq((date1, ts1, s1, ss1), (date2, ts2, s2, ss2)).toDF("d", "ts", "s", "ss") - checkAnswer(df.select(unix_timestamp(col("ts"))), Seq( - Row(secs(ts1.getTime)), Row(secs(ts2.getTime)))) - checkAnswer(df.select(unix_timestamp(col("ss"))), Seq( - Row(secs(ts1.getTime)), Row(secs(ts2.getTime)))) - checkAnswer(df.select(unix_timestamp(col("d"), fmt)), Seq( - Row(secs(date1.getTime)), Row(secs(date2.getTime)))) - checkAnswer(df.select(unix_timestamp(col("s"), fmt)), Seq( - Row(secs(ts1.getTime)), Row(secs(ts2.getTime)))) - checkAnswer(df.selectExpr("unix_timestamp(ts)"), Seq( - Row(secs(ts1.getTime)), Row(secs(ts2.getTime)))) - checkAnswer(df.selectExpr("unix_timestamp(ss)"), Seq( - Row(secs(ts1.getTime)), Row(secs(ts2.getTime)))) - checkAnswer(df.selectExpr(s"unix_timestamp(d, '$fmt')"), Seq( - Row(secs(date1.getTime)), Row(secs(date2.getTime)))) - checkAnswer(df.selectExpr(s"unix_timestamp(s, '$fmt')"), Seq( - Row(secs(ts1.getTime)), Row(secs(ts2.getTime)))) - - val x1 = "2015-07-24 10:00:00" - val x2 = "2015-25-07 02:02:02" - val x3 = "2015-07-24 25:02:02" - val x4 = "2015-24-07 26:02:02" - val ts3 = Timestamp.valueOf("2015-07-24 02:25:02") - val ts4 = Timestamp.valueOf("2015-07-24 00:10:00") - - val df1 = Seq(x1, x2, x3, x4).toDF("x") - checkAnswer(df1.select(unix_timestamp(col("x"))), Seq( - Row(secs(ts1.getTime)), Row(null), Row(null), Row(null))) - checkAnswer(df1.selectExpr("unix_timestamp(x)"), Seq( - Row(secs(ts1.getTime)), Row(null), Row(null), Row(null))) - checkAnswer(df1.select(unix_timestamp(col("x"), "yyyy-dd-MM HH:mm:ss")), Seq( - Row(null), Row(secs(ts2.getTime)), Row(null), Row(null))) - checkAnswer(df1.selectExpr(s"unix_timestamp(x, 'yyyy-MM-dd mm:HH:ss')"), Seq( - Row(secs(ts4.getTime)), Row(null), Row(secs(ts3.getTime)), Row(null))) - - // invalid format - checkAnswer(df1.selectExpr(s"unix_timestamp(x, 'yyyy-MM-dd aa:HH:ss')"), Seq( - Row(null), Row(null), Row(null), Row(null))) - - // february - val y1 = "2016-02-29" - val y2 = "2017-02-29" - val ts5 = Timestamp.valueOf("2016-02-29 00:00:00") - val df2 = Seq(y1, y2).toDF("y") - checkAnswer(df2.select(unix_timestamp(col("y"), "yyyy-MM-dd")), Seq( - Row(secs(ts5.getTime)), Row(null))) - - val now = sql("select unix_timestamp()").collect().head.getLong(0) - checkAnswer( - sql(s"select cast ($now as timestamp)"), - Row(new java.util.Date(TimeUnit.SECONDS.toMillis(now)))) + Seq(false, true).foreach { legacyParser => + withSQLConf(SQLConf.LEGACY_TIME_PARSER_ENABLED.key -> legacyParser.toString) { + val date1 = Date.valueOf("2015-07-24") + val date2 = Date.valueOf("2015-07-25") + val ts1 = Timestamp.valueOf("2015-07-24 10:00:00.3") + val ts2 = Timestamp.valueOf("2015-07-25 02:02:02.2") + val s1 = "2015/07/24 10:00:00.5" + val s2 = "2015/07/25 02:02:02.6" + val 
ss1 = "2015-07-24 10:00:00" + val ss2 = "2015-07-25 02:02:02" + val fmt = "yyyy/MM/dd HH:mm:ss.S" + val df = Seq((date1, ts1, s1, ss1), (date2, ts2, s2, ss2)).toDF("d", "ts", "s", "ss") + checkAnswer(df.select(unix_timestamp(col("ts"))), Seq( + Row(secs(ts1.getTime)), Row(secs(ts2.getTime)))) + checkAnswer(df.select(unix_timestamp(col("ss"))), Seq( + Row(secs(ts1.getTime)), Row(secs(ts2.getTime)))) + checkAnswer(df.select(unix_timestamp(col("d"), fmt)), Seq( + Row(secs(date1.getTime)), Row(secs(date2.getTime)))) + checkAnswer(df.select(unix_timestamp(col("s"), fmt)), Seq( + Row(secs(ts1.getTime)), Row(secs(ts2.getTime)))) + checkAnswer(df.selectExpr("unix_timestamp(ts)"), Seq( + Row(secs(ts1.getTime)), Row(secs(ts2.getTime)))) + checkAnswer(df.selectExpr("unix_timestamp(ss)"), Seq( + Row(secs(ts1.getTime)), Row(secs(ts2.getTime)))) + checkAnswer(df.selectExpr(s"unix_timestamp(d, '$fmt')"), Seq( + Row(secs(date1.getTime)), Row(secs(date2.getTime)))) + checkAnswer(df.selectExpr(s"unix_timestamp(s, '$fmt')"), Seq( + Row(secs(ts1.getTime)), Row(secs(ts2.getTime)))) + + val x1 = "2015-07-24 10:00:00" + val x2 = "2015-25-07 02:02:02" + val x3 = "2015-07-24 25:02:02" + val x4 = "2015-24-07 26:02:02" + val ts3 = Timestamp.valueOf("2015-07-24 02:25:02") + val ts4 = Timestamp.valueOf("2015-07-24 00:10:00") + + val df1 = Seq(x1, x2, x3, x4).toDF("x") + checkAnswer(df1.select(unix_timestamp(col("x"))), Seq( + Row(secs(ts1.getTime)), Row(null), Row(null), Row(null))) + checkAnswer(df1.selectExpr("unix_timestamp(x)"), Seq( + Row(secs(ts1.getTime)), Row(null), Row(null), Row(null))) + checkAnswer(df1.select(unix_timestamp(col("x"), "yyyy-dd-MM HH:mm:ss")), Seq( + Row(null), Row(secs(ts2.getTime)), Row(null), Row(null))) + checkAnswer(df1.selectExpr(s"unix_timestamp(x, 'yyyy-MM-dd mm:HH:ss')"), Seq( + Row(secs(ts4.getTime)), Row(null), Row(secs(ts3.getTime)), Row(null))) + + // invalid format + checkAnswer(df1.selectExpr(s"unix_timestamp(x, 'yyyy-MM-dd aa:HH:ss')"), Seq( + Row(null), Row(null), Row(null), Row(null))) + + // february + val y1 = "2016-02-29" + val y2 = "2017-02-29" + val ts5 = Timestamp.valueOf("2016-02-29 00:00:00") + val df2 = Seq(y1, y2).toDF("y") + checkAnswer(df2.select(unix_timestamp(col("y"), "yyyy-MM-dd")), Seq( + Row(secs(ts5.getTime)), Row(null))) + + val now = sql("select unix_timestamp()").collect().head.getLong(0) + checkAnswer( + sql(s"select cast ($now as timestamp)"), + Row(new java.util.Date(TimeUnit.SECONDS.toMillis(now)))) + } + } } test("to_unix_timestamp") { - val date1 = Date.valueOf("2015-07-24") - val date2 = Date.valueOf("2015-07-25") - val ts1 = Timestamp.valueOf("2015-07-24 10:00:00.3") - val ts2 = Timestamp.valueOf("2015-07-25 02:02:02.2") - val s1 = "2015/07/24 10:00:00.5" - val s2 = "2015/07/25 02:02:02.6" - val ss1 = "2015-07-24 10:00:00" - val ss2 = "2015-07-25 02:02:02" - val fmt = "yyyy/MM/dd HH:mm:ss.S" - val df = Seq((date1, ts1, s1, ss1), (date2, ts2, s2, ss2)).toDF("d", "ts", "s", "ss") - checkAnswer(df.selectExpr("to_unix_timestamp(ts)"), Seq( - Row(secs(ts1.getTime)), Row(secs(ts2.getTime)))) - checkAnswer(df.selectExpr("to_unix_timestamp(ss)"), Seq( - Row(secs(ts1.getTime)), Row(secs(ts2.getTime)))) - checkAnswer(df.selectExpr(s"to_unix_timestamp(d, '$fmt')"), Seq( - Row(secs(date1.getTime)), Row(secs(date2.getTime)))) - checkAnswer(df.selectExpr(s"to_unix_timestamp(s, '$fmt')"), Seq( - Row(secs(ts1.getTime)), Row(secs(ts2.getTime)))) - - val x1 = "2015-07-24 10:00:00" - val x2 = "2015-25-07 02:02:02" - val x3 = "2015-07-24 25:02:02" - val x4 = 
"2015-24-07 26:02:02" - val ts3 = Timestamp.valueOf("2015-07-24 02:25:02") - val ts4 = Timestamp.valueOf("2015-07-24 00:10:00") - - val df1 = Seq(x1, x2, x3, x4).toDF("x") - checkAnswer(df1.selectExpr("to_unix_timestamp(x)"), Seq( - Row(secs(ts1.getTime)), Row(null), Row(null), Row(null))) - checkAnswer(df1.selectExpr(s"to_unix_timestamp(x, 'yyyy-MM-dd mm:HH:ss')"), Seq( - Row(secs(ts4.getTime)), Row(null), Row(secs(ts3.getTime)), Row(null))) - - // february - val y1 = "2016-02-29" - val y2 = "2017-02-29" - val ts5 = Timestamp.valueOf("2016-02-29 00:00:00") - val df2 = Seq(y1, y2).toDF("y") - checkAnswer(df2.select(unix_timestamp(col("y"), "yyyy-MM-dd")), Seq( - Row(secs(ts5.getTime)), Row(null))) - - // invalid format - checkAnswer(df1.selectExpr(s"to_unix_timestamp(x, 'yyyy-MM-dd bb:HH:ss')"), Seq( - Row(null), Row(null), Row(null), Row(null))) + Seq(false, true).foreach { legacyParser => + withSQLConf(SQLConf.LEGACY_TIME_PARSER_ENABLED.key -> legacyParser.toString) { + val date1 = Date.valueOf("2015-07-24") + val date2 = Date.valueOf("2015-07-25") + val ts1 = Timestamp.valueOf("2015-07-24 10:00:00.3") + val ts2 = Timestamp.valueOf("2015-07-25 02:02:02.2") + val s1 = "2015/07/24 10:00:00.5" + val s2 = "2015/07/25 02:02:02.6" + val ss1 = "2015-07-24 10:00:00" + val ss2 = "2015-07-25 02:02:02" + val fmt = "yyyy/MM/dd HH:mm:ss.S" + val df = Seq((date1, ts1, s1, ss1), (date2, ts2, s2, ss2)).toDF("d", "ts", "s", "ss") + checkAnswer(df.selectExpr("to_unix_timestamp(ts)"), Seq( + Row(secs(ts1.getTime)), Row(secs(ts2.getTime)))) + checkAnswer(df.selectExpr("to_unix_timestamp(ss)"), Seq( + Row(secs(ts1.getTime)), Row(secs(ts2.getTime)))) + checkAnswer(df.selectExpr(s"to_unix_timestamp(d, '$fmt')"), Seq( + Row(secs(date1.getTime)), Row(secs(date2.getTime)))) + checkAnswer(df.selectExpr(s"to_unix_timestamp(s, '$fmt')"), Seq( + Row(secs(ts1.getTime)), Row(secs(ts2.getTime)))) + + val x1 = "2015-07-24 10:00:00" + val x2 = "2015-25-07 02:02:02" + val x3 = "2015-07-24 25:02:02" + val x4 = "2015-24-07 26:02:02" + val ts3 = Timestamp.valueOf("2015-07-24 02:25:02") + val ts4 = Timestamp.valueOf("2015-07-24 00:10:00") + + val df1 = Seq(x1, x2, x3, x4).toDF("x") + checkAnswer(df1.selectExpr("to_unix_timestamp(x)"), Seq( + Row(secs(ts1.getTime)), Row(null), Row(null), Row(null))) + checkAnswer(df1.selectExpr(s"to_unix_timestamp(x, 'yyyy-MM-dd mm:HH:ss')"), Seq( + Row(secs(ts4.getTime)), Row(null), Row(secs(ts3.getTime)), Row(null))) + + // february + val y1 = "2016-02-29" + val y2 = "2017-02-29" + val ts5 = Timestamp.valueOf("2016-02-29 00:00:00") + val df2 = Seq(y1, y2).toDF("y") + checkAnswer(df2.select(unix_timestamp(col("y"), "yyyy-MM-dd")), Seq( + Row(secs(ts5.getTime)), Row(null))) + + // invalid format + checkAnswer(df1.selectExpr(s"to_unix_timestamp(x, 'yyyy-MM-dd bb:HH:ss')"), Seq( + Row(null), Row(null), Row(null), Row(null))) + } + } } test("to_timestamp") { - val date1 = Date.valueOf("2015-07-24") - val date2 = Date.valueOf("2015-07-25") - val ts_date1 = Timestamp.valueOf("2015-07-24 00:00:00") - val ts_date2 = Timestamp.valueOf("2015-07-25 00:00:00") - val ts1 = Timestamp.valueOf("2015-07-24 10:00:00") - val ts2 = Timestamp.valueOf("2015-07-25 02:02:02") - val s1 = "2015/07/24 10:00:00.5" - val s2 = "2015/07/25 02:02:02.6" - val ts1m = Timestamp.valueOf("2015-07-24 10:00:00.5") - val ts2m = Timestamp.valueOf("2015-07-25 02:02:02.6") - val ss1 = "2015-07-24 10:00:00" - val ss2 = "2015-07-25 02:02:02" - val fmt = "yyyy/MM/dd HH:mm:ss.S" - val df = Seq((date1, ts1, s1, ss1), (date2, ts2, s2, 
ss2)).toDF("d", "ts", "s", "ss") - - checkAnswer(df.select(to_timestamp(col("ss"))), - df.select(unix_timestamp(col("ss")).cast("timestamp"))) - checkAnswer(df.select(to_timestamp(col("ss"))), Seq( - Row(ts1), Row(ts2))) - checkAnswer(df.select(to_timestamp(col("s"), fmt)), Seq( - Row(ts1m), Row(ts2m))) - checkAnswer(df.select(to_timestamp(col("ts"), fmt)), Seq( - Row(ts1), Row(ts2))) - checkAnswer(df.select(to_timestamp(col("d"), "yyyy-MM-dd")), Seq( - Row(ts_date1), Row(ts_date2))) + Seq(false, true).foreach { legacyParser => + withSQLConf(SQLConf.LEGACY_TIME_PARSER_ENABLED.key -> legacyParser.toString) { + val date1 = Date.valueOf("2015-07-24") + val date2 = Date.valueOf("2015-07-25") + val ts_date1 = Timestamp.valueOf("2015-07-24 00:00:00") + val ts_date2 = Timestamp.valueOf("2015-07-25 00:00:00") + val ts1 = Timestamp.valueOf("2015-07-24 10:00:00") + val ts2 = Timestamp.valueOf("2015-07-25 02:02:02") + val s1 = "2015/07/24 10:00:00.5" + val s2 = "2015/07/25 02:02:02.6" + val ts1m = Timestamp.valueOf("2015-07-24 10:00:00.5") + val ts2m = Timestamp.valueOf("2015-07-25 02:02:02.6") + val ss1 = "2015-07-24 10:00:00" + val ss2 = "2015-07-25 02:02:02" + val fmt = "yyyy/MM/dd HH:mm:ss.S" + val df = Seq((date1, ts1, s1, ss1), (date2, ts2, s2, ss2)).toDF("d", "ts", "s", "ss") + + checkAnswer(df.select(to_timestamp(col("ss"))), + df.select(unix_timestamp(col("ss")).cast("timestamp"))) + checkAnswer(df.select(to_timestamp(col("ss"))), Seq( + Row(ts1), Row(ts2))) + if (legacyParser) { + // In Spark 2.4 and earlier, to_timestamp() parses in seconds precision and cuts off + // the fractional part of seconds. The behavior was changed by SPARK-27438. + val legacyFmt = "yyyy/MM/dd HH:mm:ss" + checkAnswer(df.select(to_timestamp(col("s"), legacyFmt)), Seq( + Row(ts1), Row(ts2))) + } else { + checkAnswer(df.select(to_timestamp(col("s"), fmt)), Seq( + Row(ts1m), Row(ts2m))) + } + checkAnswer(df.select(to_timestamp(col("ts"), fmt)), Seq( + Row(ts1), Row(ts2))) + checkAnswer(df.select(to_timestamp(col("d"), "yyyy-MM-dd")), Seq( + Row(ts_date1), Row(ts_date2))) + } + } } test("datediff") { diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVSuite.scala index 97dfbbdb7fd2f..b1105b4a63bba 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVSuite.scala @@ -1182,7 +1182,7 @@ abstract class CSVSuite extends QueryTest with SharedSparkSession with TestCsvDa .schema(schemaWithCorrField1) .csv(testFile(valueMalformedFile)) checkAnswer(df2, - Row(0, null, "0,2013-111-11 12:13:14") :: + Row(0, null, "0,2013-111_11 12:13:14") :: Row(1, java.sql.Date.valueOf("1983-08-04"), null) :: Nil) @@ -1199,7 +1199,7 @@ abstract class CSVSuite extends QueryTest with SharedSparkSession with TestCsvDa .schema(schemaWithCorrField2) .csv(testFile(valueMalformedFile)) checkAnswer(df3, - Row(0, "0,2013-111-11 12:13:14", null) :: + Row(0, "0,2013-111_11 12:13:14", null) :: Row(1, null, java.sql.Date.valueOf("1983-08-04")) :: Nil) @@ -1435,7 +1435,7 @@ abstract class CSVSuite extends QueryTest with SharedSparkSession with TestCsvDa assert(df.filter($"_corrupt_record".isNull).count() == 1) checkAnswer( df.select(columnNameOfCorruptRecord), - Row("0,2013-111-11 12:13:14") :: Row(null) :: Nil + Row("0,2013-111_11 12:13:14") :: Row(null) :: Nil ) } @@ -2093,7 +2093,7 @@ abstract class CSVSuite 
extends QueryTest with SharedSparkSession with TestCsvDa Seq("csv", "").foreach { reader => withSQLConf(SQLConf.USE_V1_SOURCE_LIST.key -> reader) { withTempPath { path => - val df = Seq(("0", "2013-111-11")).toDF("a", "b") + val df = Seq(("0", "2013-111_11")).toDF("a", "b") df.write .option("header", "true") .csv(path.getAbsolutePath) @@ -2109,7 +2109,7 @@ abstract class CSVSuite extends QueryTest with SharedSparkSession with TestCsvDa .option("columnNameOfCorruptRecord", columnNameOfCorruptRecord) .schema(schemaWithCorrField) .csv(path.getAbsoluteFile.toString) - checkAnswer(readDF, Row(0, null, "0,2013-111-11") :: Nil) + checkAnswer(readDF, Row(0, null, "0,2013-111_11") :: Nil) } } } @@ -2216,7 +2216,7 @@ abstract class CSVSuite extends QueryTest with SharedSparkSession with TestCsvDa val readback = spark.read .option("mode", mode) .option("header", true) - .option("timestampFormat", "uuuu-MM-dd HH:mm:ss") + .option("timestampFormat", "yyyy-MM-dd HH:mm:ss") .option("multiLine", multiLine) .schema("c0 string, c1 integer, c2 timestamp") .csv(path.getAbsolutePath) @@ -2235,7 +2235,7 @@ abstract class CSVSuite extends QueryTest with SharedSparkSession with TestCsvDa } test("filters push down - malformed input in PERMISSIVE mode") { - val invalidTs = "2019-123-14 20:35:30" + val invalidTs = "2019-123_14 20:35:30" val invalidRow = s"0,$invalidTs,999" val validTs = "2019-12-14 20:35:30" Seq(true, false).foreach { filterPushdown => @@ -2252,7 +2252,7 @@ abstract class CSVSuite extends QueryTest with SharedSparkSession with TestCsvDa .option("mode", "PERMISSIVE") .option("columnNameOfCorruptRecord", "c3") .option("header", true) - .option("timestampFormat", "uuuu-MM-dd HH:mm:ss") + .option("timestampFormat", "yyyy-MM-dd HH:mm:ss") .schema("c0 integer, c1 timestamp, c2 integer, c3 string") .csv(path.getAbsolutePath) .where(condition) @@ -2309,3 +2309,10 @@ class CSVv2Suite extends CSVSuite { .sparkConf .set(SQLConf.USE_V1_SOURCE_LIST, "") } + +class CSVLegacyTimeParserSuite extends CSVSuite { + override protected def sparkConf: SparkConf = + super + .sparkConf + .set(SQLConf.LEGACY_TIME_PARSER_ENABLED, true) +} diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/json/JsonSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/json/JsonSuite.scala index b20da2266b0f3..7abe818a29d9f 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/json/JsonSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/json/JsonSuite.scala @@ -2572,3 +2572,10 @@ class JsonV2Suite extends JsonSuite { .sparkConf .set(SQLConf.USE_V1_SOURCE_LIST, "") } + +class JsonLegacyTimeParserSuite extends JsonSuite { + override protected def sparkConf: SparkConf = + super + .sparkConf + .set(SQLConf.LEGACY_TIME_PARSER_ENABLED, true) +} From 61b1e608f07afd965028313c13bf89c19b006312 Mon Sep 17 00:00:00 2001 From: Maxim Gekk Date: Wed, 12 Feb 2020 23:50:34 +0800 Subject: [PATCH 025/185] [SPARK-30759][SQL][TESTS][FOLLOWUP] Check cache initialization in StringRegexExpression ### What changes were proposed in this pull request? Added new test to `RegexpExpressionsSuite` which checks that `cache` of compiled pattern is set when the `right` expression (pattern in `LIKE`) is a foldable expression. ### Why are the changes needed? To be sure that `cache` in `StringRegexExpression` is initialized for foldable patterns. ### Does this PR introduce any user-facing change? No ### How was this patch tested? 
By running the added test in `RegexpExpressionsSuite`. Closes #27547 from MaxGekk/regexp-cache-test. Authored-by: Maxim Gekk Signed-off-by: Wenchen Fan --- .../sql/catalyst/expressions/RegexpExpressionsSuite.scala | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/RegexpExpressionsSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/RegexpExpressionsSuite.scala index 86da62bc74940..712d2bc4c4736 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/RegexpExpressionsSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/RegexpExpressionsSuite.scala @@ -329,4 +329,12 @@ class RegexpExpressionsSuite extends SparkFunSuite with ExpressionEvalHelper { checkEvaluation(StringSplit(s1, s2, -1), null, row3) } + test("SPARK-30759: cache initialization for literal patterns") { + val expr = "A" like Literal.create("a", StringType) + expr.eval() + val cache = expr.getClass.getSuperclass + .getDeclaredFields.filter(_.getName.endsWith("cache")).head + cache.setAccessible(true) + assert(cache.get(expr).asInstanceOf[java.util.regex.Pattern].pattern().contains("a")) + } } From 5919bd3b8d3ef3c3e957d8e3e245e00383b979bf Mon Sep 17 00:00:00 2001 From: Eric Wu <492960551@qq.com> Date: Thu, 13 Feb 2020 02:00:23 +0800 Subject: [PATCH 026/185] [SPARK-30651][SQL] Add detailed information for Aggregate operators in EXPLAIN FORMATTED ### What changes were proposed in this pull request? Currently `EXPLAIN FORMATTED` only report input attributes of HashAggregate/ObjectHashAggregate/SortAggregate, while `EXPLAIN EXTENDED` provides more information of Keys, Functions, etc. This PR enhanced `EXPLAIN FORMATTED` to sync with original explain behavior. ### Why are the changes needed? The newly added `EXPLAIN FORMATTED` got less information comparing to the original `EXPLAIN EXTENDED` ### Does this PR introduce any user-facing change? Yes, taking HashAggregate explain result as example. **SQL** ``` EXPLAIN FORMATTED SELECT COUNT(val) + SUM(key) as TOTAL, COUNT(key) FILTER (WHERE val > 1) FROM explain_temp1; ``` **EXPLAIN EXTENDED** ``` == Physical Plan == *(2) HashAggregate(keys=[], functions=[count(val#6), sum(cast(key#5 as bigint)), count(key#5)], output=[TOTAL#62L, count(key) FILTER (WHERE (val > 1))#71L]) +- Exchange SinglePartition, true, [id=#89] +- HashAggregate(keys=[], functions=[partial_count(val#6), partial_sum(cast(key#5 as bigint)), partial_count(key#5) FILTER (WHERE (val#6 > 1))], output=[count#75L, sum#76L, count#77L]) +- *(1) ColumnarToRow +- FileScan parquet default.explain_temp1[key#5,val#6] Batched: true, DataFilters: [], Format: Parquet, Location: InMemoryFileIndex[file:/Users/XXX/spark-dev/spark/spark-warehouse/explain_temp1], PartitionFilters: [], PushedFilters: [], ReadSchema: struct ``` **EXPLAIN FORMATTED - BEFORE** ``` == Physical Plan == * HashAggregate (5) +- Exchange (4) +- HashAggregate (3) +- * ColumnarToRow (2) +- Scan parquet default.explain_temp1 (1) ... ... (5) HashAggregate [codegen id : 2] Input: [count#91L, sum#92L, count#93L] ... ... ``` **EXPLAIN FORMATTED - AFTER** ``` == Physical Plan == * HashAggregate (5) +- Exchange (4) +- HashAggregate (3) +- * ColumnarToRow (2) +- Scan parquet default.explain_temp1 (1) ... ... 
(5) HashAggregate [codegen id : 2] Input: [count#91L, sum#92L, count#93L] Keys: [] Functions: [count(val#6), sum(cast(key#5 as bigint)), count(key#5)] Results: [(count(val#6)#84L + sum(cast(key#5 as bigint))#85L) AS TOTAL#78L, count(key#5)#86L AS count(key) FILTER (WHERE (val > 1))#87L] Output: [TOTAL#78L, count(key) FILTER (WHERE (val > 1))#87L] ... ... ``` ### How was this patch tested? Three tests added in explain.sql for HashAggregate/ObjectHashAggregate/SortAggregate. Closes #27368 from Eric5553/ExplainFormattedAgg. Authored-by: Eric Wu <492960551@qq.com> Signed-off-by: Wenchen Fan --- .../aggregate/BaseAggregateExec.scala | 48 ++++ .../aggregate/HashAggregateExec.scala | 2 +- .../aggregate/ObjectHashAggregateExec.scala | 2 +- .../aggregate/SortAggregateExec.scala | 4 +- .../resources/sql-tests/inputs/explain.sql | 22 +- .../sql-tests/results/explain.sql.out | 232 +++++++++++++++++- 6 files changed, 300 insertions(+), 10 deletions(-) create mode 100644 sql/core/src/main/scala/org/apache/spark/sql/execution/aggregate/BaseAggregateExec.scala diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/aggregate/BaseAggregateExec.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/aggregate/BaseAggregateExec.scala new file mode 100644 index 0000000000000..0eaa0f53fdacd --- /dev/null +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/aggregate/BaseAggregateExec.scala @@ -0,0 +1,48 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.spark.sql.execution.aggregate + +import org.apache.spark.sql.catalyst.expressions.{Attribute, NamedExpression} +import org.apache.spark.sql.catalyst.expressions.aggregate.AggregateExpression +import org.apache.spark.sql.execution.{ExplainUtils, UnaryExecNode} + +/** + * Holds common logic for aggregate operators + */ +trait BaseAggregateExec extends UnaryExecNode { + def groupingExpressions: Seq[NamedExpression] + def aggregateExpressions: Seq[AggregateExpression] + def aggregateAttributes: Seq[Attribute] + def resultExpressions: Seq[NamedExpression] + + override def verboseStringWithOperatorId(): String = { + val inputString = child.output.mkString("[", ", ", "]") + val keyString = groupingExpressions.mkString("[", ", ", "]") + val functionString = aggregateExpressions.mkString("[", ", ", "]") + val aggregateAttributeString = aggregateAttributes.mkString("[", ", ", "]") + val resultString = resultExpressions.mkString("[", ", ", "]") + s""" + |(${ExplainUtils.getOpId(this)}) $nodeName ${ExplainUtils.getCodegenId(this)} + |Input: $inputString + |Keys: $keyString + |Functions: $functionString + |Aggregate Attributes: $aggregateAttributeString + |Results: $resultString + """.stripMargin + } +} diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/aggregate/HashAggregateExec.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/aggregate/HashAggregateExec.scala index f73e214a6b41f..7a26fd7a8541a 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/aggregate/HashAggregateExec.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/aggregate/HashAggregateExec.scala @@ -53,7 +53,7 @@ case class HashAggregateExec( initialInputBufferOffset: Int, resultExpressions: Seq[NamedExpression], child: SparkPlan) - extends UnaryExecNode with BlockingOperatorWithCodegen with AliasAwareOutputPartitioning { + extends BaseAggregateExec with BlockingOperatorWithCodegen with AliasAwareOutputPartitioning { private[this] val aggregateBufferAttributes = { aggregateExpressions.flatMap(_.aggregateFunction.aggBufferAttributes) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/aggregate/ObjectHashAggregateExec.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/aggregate/ObjectHashAggregateExec.scala index 4376f6b6edd57..3fb58eb2cc8ba 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/aggregate/ObjectHashAggregateExec.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/aggregate/ObjectHashAggregateExec.scala @@ -67,7 +67,7 @@ case class ObjectHashAggregateExec( initialInputBufferOffset: Int, resultExpressions: Seq[NamedExpression], child: SparkPlan) - extends UnaryExecNode with AliasAwareOutputPartitioning { + extends BaseAggregateExec with AliasAwareOutputPartitioning { private[this] val aggregateBufferAttributes = { aggregateExpressions.flatMap(_.aggregateFunction.aggBufferAttributes) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/aggregate/SortAggregateExec.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/aggregate/SortAggregateExec.scala index b6e684e62ea5c..77ed469016fa3 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/aggregate/SortAggregateExec.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/aggregate/SortAggregateExec.scala @@ -24,7 +24,7 @@ import org.apache.spark.sql.catalyst.expressions._ import org.apache.spark.sql.catalyst.expressions.aggregate._ import org.apache.spark.sql.catalyst.plans.physical._ 
import org.apache.spark.sql.catalyst.util.truncatedString -import org.apache.spark.sql.execution.{AliasAwareOutputPartitioning, SparkPlan, UnaryExecNode} +import org.apache.spark.sql.execution.{AliasAwareOutputPartitioning, SparkPlan} import org.apache.spark.sql.execution.metric.SQLMetrics /** @@ -38,7 +38,7 @@ case class SortAggregateExec( initialInputBufferOffset: Int, resultExpressions: Seq[NamedExpression], child: SparkPlan) - extends UnaryExecNode with AliasAwareOutputPartitioning { + extends BaseAggregateExec with AliasAwareOutputPartitioning { private[this] val aggregateBufferAttributes = { aggregateExpressions.flatMap(_.aggregateFunction.aggBufferAttributes) diff --git a/sql/core/src/test/resources/sql-tests/inputs/explain.sql b/sql/core/src/test/resources/sql-tests/inputs/explain.sql index d5253e3daddb0..497b61c6134a2 100644 --- a/sql/core/src/test/resources/sql-tests/inputs/explain.sql +++ b/sql/core/src/test/resources/sql-tests/inputs/explain.sql @@ -5,6 +5,7 @@ CREATE table explain_temp1 (key int, val int) USING PARQUET; CREATE table explain_temp2 (key int, val int) USING PARQUET; CREATE table explain_temp3 (key int, val int) USING PARQUET; +CREATE table explain_temp4 (key int, val string) USING PARQUET; SET spark.sql.codegen.wholeStage = true; @@ -61,7 +62,7 @@ EXPLAIN FORMATTED FROM explain_temp2 WHERE val > 0) OR - key = (SELECT max(key) + key = (SELECT avg(key) FROM explain_temp3 WHERE val > 0); @@ -93,6 +94,25 @@ EXPLAIN FORMATTED CREATE VIEW explain_view AS SELECT key, val FROM explain_temp1; +-- HashAggregate +EXPLAIN FORMATTED + SELECT + COUNT(val) + SUM(key) as TOTAL, + COUNT(key) FILTER (WHERE val > 1) + FROM explain_temp1; + +-- ObjectHashAggregate +EXPLAIN FORMATTED + SELECT key, sort_array(collect_set(val))[0] + FROM explain_temp4 + GROUP BY key; + +-- SortAggregate +EXPLAIN FORMATTED + SELECT key, MIN(val) + FROM explain_temp4 + GROUP BY key; + -- cleanup DROP TABLE explain_temp1; DROP TABLE explain_temp2; diff --git a/sql/core/src/test/resources/sql-tests/results/explain.sql.out b/sql/core/src/test/resources/sql-tests/results/explain.sql.out index 756c14f28a657..bc28d7f87bf00 100644 --- a/sql/core/src/test/resources/sql-tests/results/explain.sql.out +++ b/sql/core/src/test/resources/sql-tests/results/explain.sql.out @@ -1,5 +1,5 @@ -- Automatically generated by SQLQueryTestSuite --- Number of queries: 18 +-- Number of queries: 22 -- !query @@ -26,6 +26,14 @@ struct<> +-- !query +CREATE table explain_temp4 (key int, val string) USING PARQUET +-- !query schema +struct<> +-- !query output + + + -- !query SET spark.sql.codegen.wholeStage = true -- !query schema @@ -76,12 +84,20 @@ Input : [key#x, val#x] (5) HashAggregate [codegen id : 1] Input: [key#x, val#x] +Keys: [key#x] +Functions: [partial_max(val#x)] +Aggregate Attributes: [max#x] +Results: [key#x, max#x] (6) Exchange Input: [key#x, max#x] (7) HashAggregate [codegen id : 2] Input: [key#x, max#x] +Keys: [key#x] +Functions: [max(val#x)] +Aggregate Attributes: [max(val#x)#x] +Results: [key#x, max(val#x)#x AS max(val)#x] (8) Exchange Input: [key#x, max(val)#x] @@ -132,12 +148,20 @@ Input : [key#x, val#x] (5) HashAggregate [codegen id : 1] Input: [key#x, val#x] +Keys: [key#x] +Functions: [partial_max(val#x)] +Aggregate Attributes: [max#x] +Results: [key#x, max#x] (6) Exchange Input: [key#x, max#x] (7) HashAggregate [codegen id : 2] Input: [key#x, max#x] +Keys: [key#x] +Functions: [max(val#x)] +Aggregate Attributes: [max(val#x)#x] +Results: [key#x, max(val#x)#x AS max(val)#x, max(val#x)#x AS max(val#x)#x] (8) Filter 
[codegen id : 2] Input : [key#x, max(val)#x, max(val#x)#x] @@ -211,12 +235,20 @@ Input : [key#x, val#x] (10) HashAggregate [codegen id : 3] Input: [key#x, val#x] +Keys: [key#x, val#x] +Functions: [] +Aggregate Attributes: [] +Results: [key#x, val#x] (11) Exchange Input: [key#x, val#x] (12) HashAggregate [codegen id : 4] Input: [key#x, val#x] +Keys: [key#x, val#x] +Functions: [] +Aggregate Attributes: [] +Results: [key#x, val#x] -- !query @@ -413,12 +445,20 @@ Input : [key#x, val#x] (9) HashAggregate [codegen id : 1] Input: [key#x] +Keys: [] +Functions: [partial_max(key#x)] +Aggregate Attributes: [max#x] +Results: [max#x] (10) Exchange Input: [max#x] (11) HashAggregate [codegen id : 2] Input: [max#x] +Keys: [] +Functions: [max(key#x)] +Aggregate Attributes: [max(key#x)#x] +Results: [max(key#x)#x AS max(key)#x] Subquery:2 Hosting operator id = 7 Hosting Expression = Subquery scalar-subquery#x, [id=#x] * HashAggregate (18) @@ -450,12 +490,20 @@ Input : [key#x, val#x] (16) HashAggregate [codegen id : 1] Input: [key#x] +Keys: [] +Functions: [partial_max(key#x)] +Aggregate Attributes: [max#x] +Results: [max#x] (17) Exchange Input: [max#x] (18) HashAggregate [codegen id : 2] Input: [max#x] +Keys: [] +Functions: [max(key#x)] +Aggregate Attributes: [max(key#x)#x] +Results: [max(key#x)#x AS max(key)#x] -- !query @@ -466,7 +514,7 @@ EXPLAIN FORMATTED FROM explain_temp2 WHERE val > 0) OR - key = (SELECT max(key) + key = (SELECT avg(key) FROM explain_temp3 WHERE val > 0) -- !query schema @@ -489,7 +537,7 @@ Input: [key#x, val#x] (3) Filter [codegen id : 1] Input : [key#x, val#x] -Condition : ((key#x = Subquery scalar-subquery#x, [id=#x]) OR (key#x = Subquery scalar-subquery#x, [id=#x])) +Condition : ((key#x = Subquery scalar-subquery#x, [id=#x]) OR (cast(key#x as double) = Subquery scalar-subquery#x, [id=#x])) ===== Subqueries ===== @@ -523,12 +571,20 @@ Input : [key#x, val#x] (8) HashAggregate [codegen id : 1] Input: [key#x] +Keys: [] +Functions: [partial_max(key#x)] +Aggregate Attributes: [max#x] +Results: [max#x] (9) Exchange Input: [max#x] (10) HashAggregate [codegen id : 2] Input: [max#x] +Keys: [] +Functions: [max(key#x)] +Aggregate Attributes: [max(key#x)#x] +Results: [max(key#x)#x AS max(key)#x] Subquery:2 Hosting operator id = 3 Hosting Expression = Subquery scalar-subquery#x, [id=#x] * HashAggregate (17) @@ -560,12 +616,20 @@ Input : [key#x, val#x] (15) HashAggregate [codegen id : 1] Input: [key#x] +Keys: [] +Functions: [partial_avg(cast(key#x as bigint))] +Aggregate Attributes: [sum#x, count#xL] +Results: [sum#x, count#xL] (16) Exchange -Input: [max#x] +Input: [sum#x, count#xL] (17) HashAggregate [codegen id : 2] -Input: [max#x] +Input: [sum#x, count#xL] +Keys: [] +Functions: [avg(cast(key#x as bigint))] +Aggregate Attributes: [avg(cast(key#x as bigint))#x] +Results: [avg(cast(key#x as bigint))#x AS avg(key)#x] -- !query @@ -615,12 +679,20 @@ Input: [key#x] (6) HashAggregate [codegen id : 1] Input: [key#x] +Keys: [] +Functions: [partial_avg(cast(key#x as bigint))] +Aggregate Attributes: [sum#x, count#xL] +Results: [sum#x, count#xL] (7) Exchange Input: [sum#x, count#xL] (8) HashAggregate [codegen id : 2] Input: [sum#x, count#xL] +Keys: [] +Functions: [avg(cast(key#x as bigint))] +Aggregate Attributes: [avg(cast(key#x as bigint))#x] +Results: [avg(cast(key#x as bigint))#x AS avg(key)#x] Subquery:2 Hosting operator id = 3 Hosting Expression = ReusedSubquery Subquery scalar-subquery#x, [id=#x] @@ -740,18 +812,30 @@ Input : [key#x, val#x] (5) HashAggregate [codegen id : 1] Input: [key#x, val#x] 
+Keys: [key#x] +Functions: [partial_max(val#x)] +Aggregate Attributes: [max#x] +Results: [key#x, max#x] (6) Exchange Input: [key#x, max#x] (7) HashAggregate [codegen id : 4] Input: [key#x, max#x] +Keys: [key#x] +Functions: [max(val#x)] +Aggregate Attributes: [max(val#x)#x] +Results: [key#x, max(val#x)#x AS max(val)#x] (8) ReusedExchange [Reuses operator id: 6] Output : ArrayBuffer(key#x, max#x) (9) HashAggregate [codegen id : 3] Input: [key#x, max#x] +Keys: [key#x] +Functions: [max(val#x)] +Aggregate Attributes: [max(val#x)#x] +Results: [key#x, max(val#x)#x AS max(val)#x] (10) BroadcastExchange Input: [key#x, max(val)#x] @@ -786,6 +870,144 @@ Output: [] (4) Project +-- !query +EXPLAIN FORMATTED + SELECT + COUNT(val) + SUM(key) as TOTAL, + COUNT(key) FILTER (WHERE val > 1) + FROM explain_temp1 +-- !query schema +struct +-- !query output +== Physical Plan == +* HashAggregate (5) ++- Exchange (4) + +- HashAggregate (3) + +- * ColumnarToRow (2) + +- Scan parquet default.explain_temp1 (1) + + +(1) Scan parquet default.explain_temp1 +Output: [key#x, val#x] +Batched: true +Location [not included in comparison]/{warehouse_dir}/explain_temp1] +ReadSchema: struct + +(2) ColumnarToRow [codegen id : 1] +Input: [key#x, val#x] + +(3) HashAggregate +Input: [key#x, val#x] +Keys: [] +Functions: [partial_count(val#x), partial_sum(cast(key#x as bigint)), partial_count(key#x) FILTER (WHERE (val#x > 1))] +Aggregate Attributes: [count#xL, sum#xL, count#xL] +Results: [count#xL, sum#xL, count#xL] + +(4) Exchange +Input: [count#xL, sum#xL, count#xL] + +(5) HashAggregate [codegen id : 2] +Input: [count#xL, sum#xL, count#xL] +Keys: [] +Functions: [count(val#x), sum(cast(key#x as bigint)), count(key#x)] +Aggregate Attributes: [count(val#x)#xL, sum(cast(key#x as bigint))#xL, count(key#x)#xL] +Results: [(count(val#x)#xL + sum(cast(key#x as bigint))#xL) AS TOTAL#xL, count(key#x)#xL AS count(key) FILTER (WHERE (val > 1))#xL] + + +-- !query +EXPLAIN FORMATTED + SELECT key, sort_array(collect_set(val))[0] + FROM explain_temp4 + GROUP BY key +-- !query schema +struct +-- !query output +== Physical Plan == +ObjectHashAggregate (5) ++- Exchange (4) + +- ObjectHashAggregate (3) + +- * ColumnarToRow (2) + +- Scan parquet default.explain_temp4 (1) + + +(1) Scan parquet default.explain_temp4 +Output: [key#x, val#x] +Batched: true +Location [not included in comparison]/{warehouse_dir}/explain_temp4] +ReadSchema: struct + +(2) ColumnarToRow [codegen id : 1] +Input: [key#x, val#x] + +(3) ObjectHashAggregate +Input: [key#x, val#x] +Keys: [key#x] +Functions: [partial_collect_set(val#x, 0, 0)] +Aggregate Attributes: [buf#x] +Results: [key#x, buf#x] + +(4) Exchange +Input: [key#x, buf#x] + +(5) ObjectHashAggregate +Input: [key#x, buf#x] +Keys: [key#x] +Functions: [collect_set(val#x, 0, 0)] +Aggregate Attributes: [collect_set(val#x, 0, 0)#x] +Results: [key#x, sort_array(collect_set(val#x, 0, 0)#x, true)[0] AS sort_array(collect_set(val), true)[0]#x] + + +-- !query +EXPLAIN FORMATTED + SELECT key, MIN(val) + FROM explain_temp4 + GROUP BY key +-- !query schema +struct +-- !query output +== Physical Plan == +SortAggregate (7) ++- * Sort (6) + +- Exchange (5) + +- SortAggregate (4) + +- * Sort (3) + +- * ColumnarToRow (2) + +- Scan parquet default.explain_temp4 (1) + + +(1) Scan parquet default.explain_temp4 +Output: [key#x, val#x] +Batched: true +Location [not included in comparison]/{warehouse_dir}/explain_temp4] +ReadSchema: struct + +(2) ColumnarToRow [codegen id : 1] +Input: [key#x, val#x] + +(3) Sort [codegen id : 1] +Input: [key#x, 
val#x] + +(4) SortAggregate +Input: [key#x, val#x] +Keys: [key#x] +Functions: [partial_min(val#x)] +Aggregate Attributes: [min#x] +Results: [key#x, min#x] + +(5) Exchange +Input: [key#x, min#x] + +(6) Sort [codegen id : 2] +Input: [key#x, min#x] + +(7) SortAggregate +Input: [key#x, min#x] +Keys: [key#x] +Functions: [min(val#x)] +Aggregate Attributes: [min(val#x)#x] +Results: [key#x, min(val#x)#x AS min(val)#x] + + -- !query DROP TABLE explain_temp1 -- !query schema From aa0d13683cdf9f38f04cc0e73dc8cf63eed29bf4 Mon Sep 17 00:00:00 2001 From: Maxim Gekk Date: Thu, 13 Feb 2020 02:31:48 +0800 Subject: [PATCH 027/185] [SPARK-30760][SQL] Port `millisToDays` and `daysToMillis` on Java 8 time API ### What changes were proposed in this pull request? In the PR, I propose to rewrite the `millisToDays` and `daysToMillis` of `DateTimeUtils` using Java 8 time API. I removed `getOffsetFromLocalMillis` from `DateTimeUtils` because it is a private methods, and is not used anymore in Spark SQL. ### Why are the changes needed? New implementation is based on Proleptic Gregorian calendar which has been already used by other date-time functions. This changes make `millisToDays` and `daysToMillis` consistent to rest Spark SQL API related to date & time operations. ### Does this PR introduce any user-facing change? Yes, this might effect behavior for old dates before 1582 year. ### How was this patch tested? By existing test suites `DateTimeUtilsSuite`, `DateFunctionsSuite`, DateExpressionsSuite`, `SQLQuerySuite` and `HiveResultSuite`. Closes #27494 from MaxGekk/millis-2-days-java8-api. Authored-by: Maxim Gekk Signed-off-by: Wenchen Fan --- .../expressions/datetimeExpressions.scala | 8 +-- .../sql/catalyst/util/DateTimeUtils.scala | 58 +++++-------------- .../catalyst/csv/UnivocityParserSuite.scala | 3 +- .../expressions/DateExpressionsSuite.scala | 19 +++--- .../catalyst/util/DateTimeUtilsSuite.scala | 34 ++++++----- .../spark/sql/execution/HiveResult.scala | 5 ++ .../sql-tests/results/postgreSQL/date.sql.out | 12 ++-- .../apache/spark/sql/SQLQueryTestSuite.scala | 1 + 8 files changed, 62 insertions(+), 78 deletions(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/datetimeExpressions.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/datetimeExpressions.scala index 1f4c8c041f8bf..cf91489d8e6b7 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/datetimeExpressions.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/datetimeExpressions.scala @@ -135,7 +135,7 @@ case class CurrentBatchTimestamp( def toLiteral: Literal = dataType match { case _: TimestampType => Literal(DateTimeUtils.fromJavaTimestamp(new Timestamp(timestampMs)), TimestampType) - case _: DateType => Literal(DateTimeUtils.millisToDays(timestampMs, timeZone), DateType) + case _: DateType => Literal(DateTimeUtils.millisToDays(timestampMs, zoneId), DateType) } } @@ -1332,14 +1332,14 @@ case class MonthsBetween( override def nullSafeEval(t1: Any, t2: Any, roundOff: Any): Any = { DateTimeUtils.monthsBetween( - t1.asInstanceOf[Long], t2.asInstanceOf[Long], roundOff.asInstanceOf[Boolean], timeZone) + t1.asInstanceOf[Long], t2.asInstanceOf[Long], roundOff.asInstanceOf[Boolean], zoneId) } override def doGenCode(ctx: CodegenContext, ev: ExprCode): ExprCode = { - val tz = ctx.addReferenceObj("timeZone", timeZone) + val zid = ctx.addReferenceObj("zoneId", zoneId, classOf[ZoneId].getName) val dtu = 
DateTimeUtils.getClass.getName.stripSuffix("$") defineCodeGen(ctx, ev, (d1, d2, roundOff) => { - s"""$dtu.monthsBetween($d1, $d2, $roundOff, $tz)""" + s"""$dtu.monthsBetween($d1, $d2, $roundOff, $zid)""" }) } diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/DateTimeUtils.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/DateTimeUtils.scala index eeae0674166bc..5976bcbb52fd7 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/DateTimeUtils.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/DateTimeUtils.scala @@ -67,24 +67,22 @@ object DateTimeUtils { // we should use the exact day as Int, for example, (year, month, day) -> day def millisToDays(millisUtc: Long): SQLDate = { - millisToDays(millisUtc, defaultTimeZone()) + millisToDays(millisUtc, defaultTimeZone().toZoneId) } - def millisToDays(millisUtc: Long, timeZone: TimeZone): SQLDate = { - // SPARK-6785: use Math.floorDiv so negative number of days (dates before 1970) - // will correctly work as input for function toJavaDate(Int) - val millisLocal = millisUtc + timeZone.getOffset(millisUtc) - Math.floorDiv(millisLocal, MILLIS_PER_DAY).toInt + def millisToDays(millisUtc: Long, zoneId: ZoneId): SQLDate = { + val instant = microsToInstant(Math.multiplyExact(millisUtc, MICROS_PER_MILLIS)) + localDateToDays(LocalDateTime.ofInstant(instant, zoneId).toLocalDate) } // reverse of millisToDays def daysToMillis(days: SQLDate): Long = { - daysToMillis(days, defaultTimeZone()) + daysToMillis(days, defaultTimeZone().toZoneId) } - def daysToMillis(days: SQLDate, timeZone: TimeZone): Long = { - val millisLocal = days.toLong * MILLIS_PER_DAY - millisLocal - getOffsetFromLocalMillis(millisLocal, timeZone) + def daysToMillis(days: SQLDate, zoneId: ZoneId): Long = { + val instant = daysToLocalDate(days).atStartOfDay(zoneId).toInstant + instantToMicros(instant) / MICROS_PER_MILLIS } // Converts Timestamp to string according to Hive TimestampWritable convention. 
@@ -589,11 +587,11 @@ object DateTimeUtils { time1: SQLTimestamp, time2: SQLTimestamp, roundOff: Boolean, - timeZone: TimeZone): Double = { + zoneId: ZoneId): Double = { val millis1 = MICROSECONDS.toMillis(time1) val millis2 = MICROSECONDS.toMillis(time2) - val date1 = millisToDays(millis1, timeZone) - val date2 = millisToDays(millis2, timeZone) + val date1 = millisToDays(millis1, zoneId) + val date2 = millisToDays(millis2, zoneId) val (year1, monthInYear1, dayInMonth1, daysToMonthEnd1) = splitDate(date1) val (year2, monthInYear2, dayInMonth2, daysToMonthEnd2) = splitDate(date2) @@ -607,8 +605,8 @@ object DateTimeUtils { } // using milliseconds can cause precision loss with more than 8 digits // we follow Hive's implementation which uses seconds - val secondsInDay1 = MILLISECONDS.toSeconds(millis1 - daysToMillis(date1, timeZone)) - val secondsInDay2 = MILLISECONDS.toSeconds(millis2 - daysToMillis(date2, timeZone)) + val secondsInDay1 = MILLISECONDS.toSeconds(millis1 - daysToMillis(date1, zoneId)) + val secondsInDay2 = MILLISECONDS.toSeconds(millis2 - daysToMillis(date2, zoneId)) val secondsDiff = (dayInMonth1 - dayInMonth2) * SECONDS_PER_DAY + secondsInDay1 - secondsInDay2 val secondsInMonth = DAYS.toSeconds(31) val diff = monthDiff + secondsDiff / secondsInMonth.toDouble @@ -737,8 +735,8 @@ object DateTimeUtils { millis += offset millis - millis % MILLIS_PER_DAY - offset case _ => // Try to truncate date levels - val dDays = millisToDays(millis, timeZone) - daysToMillis(truncDate(dDays, level), timeZone) + val dDays = millisToDays(millis, timeZone.toZoneId) + daysToMillis(truncDate(dDays, level), timeZone.toZoneId) } truncated * MICROS_PER_MILLIS } @@ -770,32 +768,6 @@ object DateTimeUtils { } } - /** - * Lookup the offset for given millis seconds since 1970-01-01 00:00:00 in given timezone. - * TODO: Improve handling of normalization differences. - * TODO: Replace with JSR-310 or similar system - see SPARK-16788 - */ - private[sql] def getOffsetFromLocalMillis(millisLocal: Long, tz: TimeZone): Long = { - var guess = tz.getRawOffset - // the actual offset should be calculated based on milliseconds in UTC - val offset = tz.getOffset(millisLocal - guess) - if (offset != guess) { - guess = tz.getOffset(millisLocal - offset) - if (guess != offset) { - // fallback to do the reverse lookup using java.time.LocalDateTime - // this should only happen near the start or end of DST - val localDate = LocalDate.ofEpochDay(MILLISECONDS.toDays(millisLocal)) - val localTime = LocalTime.ofNanoOfDay(MILLISECONDS.toNanos( - Math.floorMod(millisLocal, MILLIS_PER_DAY))) - val localDateTime = LocalDateTime.of(localDate, localTime) - val millisEpoch = localDateTime.atZone(tz.toZoneId).toInstant.toEpochMilli - - guess = (millisLocal - millisEpoch).toInt - } - } - guess - } - /** * Convert the timestamp `ts` from one timezone to another. 
* diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/csv/UnivocityParserSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/csv/UnivocityParserSuite.scala index 77a2ca7e4a828..536c76f042d23 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/csv/UnivocityParserSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/csv/UnivocityParserSuite.scala @@ -19,6 +19,7 @@ package org.apache.spark.sql.catalyst.csv import java.math.BigDecimal import java.text.{DecimalFormat, DecimalFormatSymbols} +import java.time.ZoneOffset import java.util.{Locale, TimeZone} import org.apache.commons.lang3.time.FastDateFormat @@ -137,7 +138,7 @@ class UnivocityParserSuite extends SparkFunSuite with SQLHelper { val expectedDate = format.parse(customDate).getTime val castedDate = parser.makeConverter("_1", DateType, nullable = true) .apply(customDate) - assert(castedDate == DateTimeUtils.millisToDays(expectedDate, TimeZone.getTimeZone("GMT"))) + assert(castedDate == DateTimeUtils.millisToDays(expectedDate, ZoneOffset.UTC)) val timestamp = "2015-01-01 00:00:00" timestampsOptions = new CSVOptions(Map( diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/DateExpressionsSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/DateExpressionsSuite.scala index f04149ab7eb29..39b859af47ca9 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/DateExpressionsSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/DateExpressionsSuite.scala @@ -56,9 +56,9 @@ class DateExpressionsSuite extends SparkFunSuite with ExpressionEvalHelper { val ts = new Timestamp(toMillis(time)) test("datetime function current_date") { - val d0 = DateTimeUtils.millisToDays(System.currentTimeMillis(), TimeZoneGMT) + val d0 = DateTimeUtils.millisToDays(System.currentTimeMillis(), ZoneOffset.UTC) val cd = CurrentDate(gmtId).eval(EmptyRow).asInstanceOf[Int] - val d1 = DateTimeUtils.millisToDays(System.currentTimeMillis(), TimeZoneGMT) + val d1 = DateTimeUtils.millisToDays(System.currentTimeMillis(), ZoneOffset.UTC) assert(d0 <= cd && cd <= d1 && d1 - d0 <= 1) val cdjst = CurrentDate(jstId).eval(EmptyRow).asInstanceOf[Int] @@ -499,7 +499,8 @@ class DateExpressionsSuite extends SparkFunSuite with ExpressionEvalHelper { // Valid range of DateType is [0001-01-01, 9999-12-31] val maxMonthInterval = 10000 * 12 checkEvaluation( - AddMonths(Literal(Date.valueOf("0001-01-01")), Literal(maxMonthInterval)), 2933261) + AddMonths(Literal(LocalDate.parse("0001-01-01")), Literal(maxMonthInterval)), + LocalDate.of(10001, 1, 1).toEpochDay.toInt) checkEvaluation( AddMonths(Literal(Date.valueOf("9999-12-31")), Literal(-1 * maxMonthInterval)), -719529) // Test evaluation results between Interpreted mode and Codegen mode @@ -788,7 +789,7 @@ class DateExpressionsSuite extends SparkFunSuite with ExpressionEvalHelper { checkEvaluation( UnixTimestamp(Literal(date1), Literal("yyyy-MM-dd HH:mm:ss"), timeZoneId), MILLISECONDS.toSeconds( - DateTimeUtils.daysToMillis(DateTimeUtils.fromJavaDate(date1), tz))) + DateTimeUtils.daysToMillis(DateTimeUtils.fromJavaDate(date1), tz.toZoneId))) checkEvaluation( UnixTimestamp(Literal(sdf2.format(new Timestamp(-1000000))), Literal(fmt2), timeZoneId), @@ -796,7 +797,7 @@ class DateExpressionsSuite extends SparkFunSuite with ExpressionEvalHelper { checkEvaluation(UnixTimestamp( Literal(sdf3.format(Date.valueOf("2015-07-24"))), Literal(fmt3), 
timeZoneId), MILLISECONDS.toSeconds(DateTimeUtils.daysToMillis( - DateTimeUtils.fromJavaDate(Date.valueOf("2015-07-24")), tz))) + DateTimeUtils.fromJavaDate(Date.valueOf("2015-07-24")), tz.toZoneId))) val t1 = UnixTimestamp( CurrentTimestamp(), Literal("yyyy-MM-dd HH:mm:ss")).eval().asInstanceOf[Long] val t2 = UnixTimestamp( @@ -814,7 +815,7 @@ class DateExpressionsSuite extends SparkFunSuite with ExpressionEvalHelper { checkEvaluation( UnixTimestamp(Literal(date1), Literal.create(null, StringType), timeZoneId), MILLISECONDS.toSeconds( - DateTimeUtils.daysToMillis(DateTimeUtils.fromJavaDate(date1), tz))) + DateTimeUtils.daysToMillis(DateTimeUtils.fromJavaDate(date1), tz.toZoneId))) checkEvaluation( UnixTimestamp(Literal("2015-07-24"), Literal("not a valid format"), timeZoneId), null) } @@ -852,7 +853,7 @@ class DateExpressionsSuite extends SparkFunSuite with ExpressionEvalHelper { checkEvaluation( ToUnixTimestamp(Literal(date1), Literal(fmt1), timeZoneId), MILLISECONDS.toSeconds( - DateTimeUtils.daysToMillis(DateTimeUtils.fromJavaDate(date1), tz))) + DateTimeUtils.daysToMillis(DateTimeUtils.fromJavaDate(date1), tz.toZoneId))) checkEvaluation( ToUnixTimestamp( Literal(sdf2.format(new Timestamp(-1000000))), @@ -861,7 +862,7 @@ class DateExpressionsSuite extends SparkFunSuite with ExpressionEvalHelper { checkEvaluation(ToUnixTimestamp( Literal(sdf3.format(Date.valueOf("2015-07-24"))), Literal(fmt3), timeZoneId), MILLISECONDS.toSeconds(DateTimeUtils.daysToMillis( - DateTimeUtils.fromJavaDate(Date.valueOf("2015-07-24")), tz))) + DateTimeUtils.fromJavaDate(Date.valueOf("2015-07-24")), tz.toZoneId))) val t1 = ToUnixTimestamp( CurrentTimestamp(), Literal(fmt1)).eval().asInstanceOf[Long] val t2 = ToUnixTimestamp( @@ -876,7 +877,7 @@ class DateExpressionsSuite extends SparkFunSuite with ExpressionEvalHelper { checkEvaluation(ToUnixTimestamp( Literal(date1), Literal.create(null, StringType), timeZoneId), MILLISECONDS.toSeconds( - DateTimeUtils.daysToMillis(DateTimeUtils.fromJavaDate(date1), tz))) + DateTimeUtils.daysToMillis(DateTimeUtils.fromJavaDate(date1), tz.toZoneId))) checkEvaluation( ToUnixTimestamp( Literal("2015-07-24"), diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/util/DateTimeUtilsSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/util/DateTimeUtilsSuite.scala index cabcd3007d1c0..cd0594c775a47 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/util/DateTimeUtilsSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/util/DateTimeUtilsSuite.scala @@ -86,9 +86,13 @@ class DateTimeUtilsSuite extends SparkFunSuite with Matchers with SQLHelper { } test("SPARK-6785: java date conversion before and after epoch") { + def format(d: Date): String = { + TimestampFormatter("uuuu-MM-dd", defaultTimeZone().toZoneId) + .format(d.getTime * MICROS_PER_MILLIS) + } def checkFromToJavaDate(d1: Date): Unit = { val d2 = toJavaDate(fromJavaDate(d1)) - assert(d2.toString === d1.toString) + assert(format(d2) === format(d1)) } val df1 = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss", Locale.US) @@ -413,22 +417,22 @@ class DateTimeUtilsSuite extends SparkFunSuite with Matchers with SQLHelper { test("monthsBetween") { val date1 = date(1997, 2, 28, 10, 30, 0) var date2 = date(1996, 10, 30) - assert(monthsBetween(date1, date2, true, TimeZoneUTC) === 3.94959677) - assert(monthsBetween(date1, date2, false, TimeZoneUTC) === 3.9495967741935485) + assert(monthsBetween(date1, date2, true, ZoneOffset.UTC) === 3.94959677) + 
assert(monthsBetween(date1, date2, false, ZoneOffset.UTC) === 3.9495967741935485) Seq(true, false).foreach { roundOff => date2 = date(2000, 2, 28) - assert(monthsBetween(date1, date2, roundOff, TimeZoneUTC) === -36) + assert(monthsBetween(date1, date2, roundOff, ZoneOffset.UTC) === -36) date2 = date(2000, 2, 29) - assert(monthsBetween(date1, date2, roundOff, TimeZoneUTC) === -36) + assert(monthsBetween(date1, date2, roundOff, ZoneOffset.UTC) === -36) date2 = date(1996, 3, 31) - assert(monthsBetween(date1, date2, roundOff, TimeZoneUTC) === 11) + assert(monthsBetween(date1, date2, roundOff, ZoneOffset.UTC) === 11) } val date3 = date(2000, 2, 28, 16, tz = TimeZonePST) val date4 = date(1997, 2, 28, 16, tz = TimeZonePST) - assert(monthsBetween(date3, date4, true, TimeZonePST) === 36.0) - assert(monthsBetween(date3, date4, true, TimeZoneGMT) === 35.90322581) - assert(monthsBetween(date3, date4, false, TimeZoneGMT) === 35.903225806451616) + assert(monthsBetween(date3, date4, true, TimeZonePST.toZoneId) === 36.0) + assert(monthsBetween(date3, date4, true, ZoneOffset.UTC) === 35.90322581) + assert(monthsBetween(date3, date4, false, ZoneOffset.UTC) === 35.903225806451616) } test("from UTC timestamp") { @@ -571,15 +575,15 @@ class DateTimeUtilsSuite extends SparkFunSuite with Matchers with SQLHelper { test("daysToMillis and millisToDays") { val input = TimeUnit.MICROSECONDS.toMillis(date(2015, 12, 31, 16, tz = TimeZonePST)) - assert(millisToDays(input, TimeZonePST) === 16800) - assert(millisToDays(input, TimeZoneGMT) === 16801) - assert(millisToDays(-1 * MILLIS_PER_DAY + 1, TimeZoneGMT) == -1) + assert(millisToDays(input, TimeZonePST.toZoneId) === 16800) + assert(millisToDays(input, ZoneOffset.UTC) === 16801) + assert(millisToDays(-1 * MILLIS_PER_DAY + 1, ZoneOffset.UTC) == -1) var expected = TimeUnit.MICROSECONDS.toMillis(date(2015, 12, 31, tz = TimeZonePST)) - assert(daysToMillis(16800, TimeZonePST) === expected) + assert(daysToMillis(16800, TimeZonePST.toZoneId) === expected) expected = TimeUnit.MICROSECONDS.toMillis(date(2015, 12, 31, tz = TimeZoneGMT)) - assert(daysToMillis(16800, TimeZoneGMT) === expected) + assert(daysToMillis(16800, ZoneOffset.UTC) === expected) // There are some days are skipped entirely in some timezone, skip them here. 
val skipped_days = Map[String, Set[Int]]( @@ -594,7 +598,7 @@ class DateTimeUtilsSuite extends SparkFunSuite with Matchers with SQLHelper { val skipped = skipped_days.getOrElse(tz.getID, Set.empty) (-20000 to 20000).foreach { d => if (!skipped.contains(d)) { - assert(millisToDays(daysToMillis(d, tz), tz) === d, + assert(millisToDays(daysToMillis(d, tz.toZoneId), tz.toZoneId) === d, s"Round trip of ${d} did not work in tz ${tz}") } } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/HiveResult.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/HiveResult.scala index bbe47a63f4d61..5a2f16d8e1526 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/HiveResult.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/HiveResult.scala @@ -19,6 +19,7 @@ package org.apache.spark.sql.execution import java.nio.charset.StandardCharsets import java.sql.{Date, Timestamp} +import java.time.{Instant, LocalDate} import org.apache.spark.sql.Row import org.apache.spark.sql.catalyst.util.{DateFormatter, DateTimeUtils, TimestampFormatter} @@ -67,8 +68,12 @@ object HiveResult { case (null, _) => if (nested) "null" else "NULL" case (b, BooleanType) => b.toString case (d: Date, DateType) => dateFormatter.format(DateTimeUtils.fromJavaDate(d)) + case (ld: LocalDate, DateType) => + dateFormatter.format(DateTimeUtils.localDateToDays(ld)) case (t: Timestamp, TimestampType) => timestampFormatter.format(DateTimeUtils.fromJavaTimestamp(t)) + case (i: Instant, TimestampType) => + timestampFormatter.format(DateTimeUtils.instantToMicros(i)) case (bin: Array[Byte], BinaryType) => new String(bin, StandardCharsets.UTF_8) case (decimal: java.math.BigDecimal, DecimalType()) => decimal.toPlainString case (n, _: NumericType) => n.toString diff --git a/sql/core/src/test/resources/sql-tests/results/postgreSQL/date.sql.out b/sql/core/src/test/resources/sql-tests/results/postgreSQL/date.sql.out index fd5dc42632176..ed27317121623 100755 --- a/sql/core/src/test/resources/sql-tests/results/postgreSQL/date.sql.out +++ b/sql/core/src/test/resources/sql-tests/results/postgreSQL/date.sql.out @@ -800,7 +800,7 @@ SELECT DATE_TRUNC('MILLENNIUM', TIMESTAMP '1970-03-20 04:30:00.00000') -- !query schema struct -- !query output -1001-01-01 00:07:02 +1001-01-01 00:00:00 -- !query @@ -808,7 +808,7 @@ SELECT DATE_TRUNC('MILLENNIUM', DATE '1970-03-20') -- !query schema struct -- !query output -1001-01-01 00:07:02 +1001-01-01 00:00:00 -- !query @@ -840,7 +840,7 @@ SELECT DATE_TRUNC('CENTURY', DATE '0002-02-04') -- !query schema struct -- !query output -0001-01-01 00:07:02 +0001-01-01 00:00:00 -- !query @@ -848,7 +848,7 @@ SELECT DATE_TRUNC('CENTURY', TO_DATE('0055-08-10 BC', 'yyyy-MM-dd G')) -- !query schema struct -- !query output --0099-01-01 00:07:02 +-0099-01-01 00:00:00 -- !query @@ -864,7 +864,7 @@ SELECT DATE_TRUNC('DECADE', DATE '0004-12-25') -- !query schema struct -- !query output -0000-01-01 00:07:02 +0000-01-01 00:00:00 -- !query @@ -872,7 +872,7 @@ SELECT DATE_TRUNC('DECADE', TO_DATE('0002-12-31 BC', 'yyyy-MM-dd G')) -- !query schema struct -- !query output --0010-01-01 00:07:02 +-0010-01-01 00:00:00 -- !query diff --git a/sql/core/src/test/scala/org/apache/spark/sql/SQLQueryTestSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/SQLQueryTestSuite.scala index 2e5a9e0b4d45d..6b9e5bbd3c961 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/SQLQueryTestSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/SQLQueryTestSuite.scala @@ -337,6 +337,7 @@ class 
SQLQueryTestSuite extends QueryTest with SharedSparkSession { localSparkSession.conf.set(SQLConf.ANSI_ENABLED.key, true) case _ => } + localSparkSession.conf.set(SQLConf.DATETIME_JAVA8API_ENABLED.key, true) if (configSet.nonEmpty) { // Execute the list of set operation in order to add the desired configs From 5b76367a9d0aaca53ce96ab7e555a596567e8335 Mon Sep 17 00:00:00 2001 From: Liang-Chi Hsieh Date: Wed, 12 Feb 2020 14:27:18 -0800 Subject: [PATCH 028/185] [SPARK-30797][SQL] Set tradition user/group/other permission to ACL entries when setting up ACLs in truncate table ### What changes were proposed in this pull request? This is a follow-up to the PR #26956. In #26956, the patch proposed to preserve path permission when truncating table. When setting up original ACLs, we need to set user/group/other permission as ACL entries too, otherwise if the path doesn't have default user/group/other ACL entries, ACL API will complain an error `Invalid ACL: the user, group and other entries are required.`. In short this change makes sure: 1. Permissions for user/group/other are always kept into ACLs to work with ACL API. 2. Other custom ACLs are still kept after TRUNCATE TABLE (#26956 did this). ### Why are the changes needed? Without this fix, `TRUNCATE TABLE` will get an error when setting up ACLs if there is no default default user/group/other ACL entries. ### Does this PR introduce any user-facing change? No ### How was this patch tested? Update unit test. Manual test on dev Spark cluster. Set ACLs for a table path without default user/group/other ACL entries: ``` hdfs dfs -setfacl --set 'user:liangchi:rwx,user::rwx,group::r--,other::r--' /user/hive/warehouse/test.db/test_truncate_table hdfs dfs -getfacl /user/hive/warehouse/test.db/test_truncate_table # file: /user/hive/warehouse/test.db/test_truncate_table # owner: liangchi # group: supergroup user::rwx user:liangchi:rwx group::r-- mask::rwx other::r-- ``` Then run `sql("truncate table test.test_truncate_table")`, it works by normally truncating the table and preserve ACLs. Closes #27548 from viirya/fix-truncate-table-permission. 
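For illustration only (not part of this patch): a minimal standalone sketch of the approach the diff below takes, capturing a path's named ACL entries plus its traditional user/group/other permission bits and re-applying them together, since `setAcl` rejects an ACL without the user/group/other entries. The warehouse path is a placeholder.

```
import scala.collection.JavaConverters._

import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.{FileSystem, Path}
import org.apache.hadoop.fs.permission.{AclEntry, AclEntryScope, AclEntryType, FsAction, FsPermission}

object TruncateAclSketch {
  // Build an ACCESS-scope ACL entry from a traditional user/group/other permission bit.
  private def baseEntry(aclType: AclEntryType, action: FsAction): AclEntry =
    new AclEntry.Builder()
      .setScope(AclEntryScope.ACCESS)
      .setType(aclType)
      .setPermission(action)
      .build()

  def main(args: Array[String]): Unit = {
    // Placeholder path; in TRUNCATE TABLE this is the table/partition location.
    val path = new Path("/user/hive/warehouse/test.db/test_truncate_table")
    val fs = FileSystem.get(new Configuration())

    // Capture the original permission bits and the named (custom) ACL entries.
    val permission: FsPermission = fs.getFileStatus(path).getPermission
    val namedEntries = fs.getAclStatus(path).getEntries.asScala.filter(_.getName != null)

    // ... the directory would be deleted and recreated here ...

    // Re-add the user/group/other bits as ACCESS entries alongside the named entries,
    // otherwise setAcl fails with "Invalid ACL: the user, group and other entries are required."
    val fullAcl = (namedEntries ++ Seq(
      baseEntry(AclEntryType.USER, permission.getUserAction),
      baseEntry(AclEntryType.GROUP, permission.getGroupAction),
      baseEntry(AclEntryType.OTHER, permission.getOtherAction))).asJava

    fs.setAcl(path, fullAcl)
  }
}
```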
Lead-authored-by: Liang-Chi Hsieh Co-authored-by: Liang-Chi Hsieh Signed-off-by: Dongjoon Hyun --- .../spark/sql/execution/command/tables.scala | 32 +++++++++++++++++-- .../sql/execution/command/DDLSuite.scala | 21 +++++++++++- 2 files changed, 49 insertions(+), 4 deletions(-) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/command/tables.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/command/tables.scala index 90dbdf5515d4d..61500b773cd7e 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/command/tables.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/command/tables.scala @@ -19,12 +19,13 @@ package org.apache.spark.sql.execution.command import java.net.{URI, URISyntaxException} +import scala.collection.JavaConverters._ import scala.collection.mutable.ArrayBuffer import scala.util.Try import scala.util.control.NonFatal import org.apache.hadoop.fs.{FileContext, FsConstants, Path} -import org.apache.hadoop.fs.permission.{AclEntry, FsPermission} +import org.apache.hadoop.fs.permission.{AclEntry, AclEntryScope, AclEntryType, FsAction, FsPermission} import org.apache.spark.sql.{AnalysisException, Row, SparkSession} import org.apache.spark.sql.catalyst.TableIdentifier @@ -538,12 +539,27 @@ case class TruncateTableCommand( } } optAcls.foreach { acls => + val aclEntries = acls.asScala.filter(_.getName != null).asJava + + // If the path doesn't have default ACLs, `setAcl` API will throw an error + // as it expects user/group/other permissions must be in ACL entries. + // So we need to add tradition user/group/other permission + // in the form of ACL. + optPermission.map { permission => + aclEntries.add(newAclEntry(AclEntryScope.ACCESS, + AclEntryType.USER, permission.getUserAction())) + aclEntries.add(newAclEntry(AclEntryScope.ACCESS, + AclEntryType.GROUP, permission.getGroupAction())) + aclEntries.add(newAclEntry(AclEntryScope.ACCESS, + AclEntryType.OTHER, permission.getOtherAction())) + } + try { - fs.setAcl(path, acls) + fs.setAcl(path, aclEntries) } catch { case NonFatal(e) => throw new SecurityException( - s"Failed to set original ACL $acls back to " + + s"Failed to set original ACL $aclEntries back to " + s"the created path: $path. Exception: ${e.getMessage}") } } @@ -574,6 +590,16 @@ case class TruncateTableCommand( } Seq.empty[Row] } + + private def newAclEntry( + scope: AclEntryScope, + aclType: AclEntryType, + permission: FsAction): AclEntry = { + new AclEntry.Builder() + .setScope(scope) + .setType(aclType) + .setPermission(permission).build() + } } abstract class DescribeCommandBase extends RunnableCommand { diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/command/DDLSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/command/DDLSuite.scala index 31e00781ae6b4..dbf4b09403423 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/command/DDLSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/command/DDLSuite.scala @@ -2042,6 +2042,7 @@ abstract class DDLSuite extends QueryTest with SQLTestUtils { // Set ACL to table path. 
val customAcl = new java.util.ArrayList[AclEntry]() customAcl.add(new AclEntry.Builder() + .setName("test") .setType(AclEntryType.USER) .setScope(AclEntryScope.ACCESS) .setPermission(FsAction.READ).build()) @@ -2061,8 +2062,26 @@ abstract class DDLSuite extends QueryTest with SQLTestUtils { if (ignore) { assert(aclEntries.size() == 0) } else { - assert(aclEntries.size() == 1) + assert(aclEntries.size() == 4) assert(aclEntries.get(0) == customAcl.get(0)) + + // Setting ACLs will also set user/group/other permissions + // as ACL entries. + val user = new AclEntry.Builder() + .setType(AclEntryType.USER) + .setScope(AclEntryScope.ACCESS) + .setPermission(FsAction.ALL).build() + val group = new AclEntry.Builder() + .setType(AclEntryType.GROUP) + .setScope(AclEntryScope.ACCESS) + .setPermission(FsAction.ALL).build() + val other = new AclEntry.Builder() + .setType(AclEntryType.OTHER) + .setScope(AclEntryScope.ACCESS) + .setPermission(FsAction.ALL).build() + assert(aclEntries.get(1) == user) + assert(aclEntries.get(2) == group) + assert(aclEntries.get(3) == other) } } } From 496f6ac86001d284cbfb7488a63dd3a168919c0f Mon Sep 17 00:00:00 2001 From: Thomas Graves Date: Wed, 12 Feb 2020 16:45:42 -0600 Subject: [PATCH 029/185] [SPARK-29148][CORE] Add stage level scheduling dynamic allocation and scheduler backend changes ### What changes were proposed in this pull request? This is another PR for stage level scheduling. In particular this adds changes to the dynamic allocation manager and the scheduler backend to be able to track what executors are needed per ResourceProfile. Note the api is still private to Spark until the entire feature gets in, so this functionality will be there but only usable by tests for profiles other then the DefaultProfile. The main changes here are simply tracking things on a ResourceProfile basis as well as sending the executor requests to the scheduler backend for all ResourceProfiles. I introduce a ResourceProfileManager in this PR that will track all the actual ResourceProfile objects so that we can keep them all in a single place and just pass around and use in datastructures the resource profile id. The resource profile id can be used with the ResourceProfileManager to get the actual ResourceProfile contents. There are various places in the code that use executor "slots" for things. The ResourceProfile adds functionality to keep that calculation in it. This logic is more complex then it should due to standalone mode and mesos coarse grained not setting the executor cores config. They default to all cores on the worker, so calculating slots is harder there. This PR keeps the functionality to make the cores the limiting resource because the scheduler still uses that for "slots" for a few things. This PR does also add the resource profile id to the Stage and stage info classes to be able to test things easier. That full set of changes will come with the scheduler PR that will be after this one. The PR stops at the scheduler backend pieces for the cluster manager and the real YARN support hasn't been added in this PR, that again will be in a separate PR, so this has a few of the API changes up to the cluster manager and then just uses the default profile requests to continue. The code for the entire feature is here for reference: https://github.com/apache/spark/pull/27053/files although it needs to be upmerged again as well. ### Why are the changes needed? Needed for stage level scheduling feature. ### Does this PR introduce any user-facing change? 
No user facing api changes added here. ### How was this patch tested? Lots of unit tests and manually testing. I tested on yarn, k8s, standalone, local modes. Ran both failure and success cases. Closes #27313 from tgravescs/SPARK-29148. Authored-by: Thomas Graves Signed-off-by: Thomas Graves --- .../spark/ExecutorAllocationClient.scala | 31 +- .../spark/ExecutorAllocationManager.scala | 473 +++++--- .../scala/org/apache/spark/SparkContext.scala | 150 +-- .../apache/spark/internal/config/Tests.scala | 9 + .../resource/ExecutorResourceRequests.scala | 2 +- .../spark/resource/ResourceProfile.scala | 150 ++- .../resource/ResourceProfileBuilder.scala | 2 +- .../resource/ResourceProfileManager.scala | 86 ++ .../apache/spark/resource/ResourceUtils.scala | 109 +- .../apache/spark/scheduler/DAGScheduler.scala | 9 +- .../apache/spark/scheduler/ResultStage.scala | 5 +- .../spark/scheduler/ShuffleMapStage.scala | 5 +- .../org/apache/spark/scheduler/Stage.scala | 9 +- .../apache/spark/scheduler/StageInfo.scala | 9 +- .../spark/scheduler/TaskSchedulerImpl.scala | 4 +- .../CoarseGrainedSchedulerBackend.scala | 150 +-- .../cluster/StandaloneSchedulerBackend.scala | 11 +- .../scheduler/dynalloc/ExecutorMonitor.scala | 11 +- .../org/apache/spark/util/JsonProtocol.scala | 8 +- .../ExecutorAllocationManagerSuite.scala | 1049 +++++++++++------ .../apache/spark/HeartbeatReceiverSuite.scala | 9 +- .../org/apache/spark/LocalSparkContext.scala | 2 +- .../org/apache/spark/SparkContextSuite.scala | 36 +- .../BasicEventFilterBuilderSuite.scala | 4 +- .../ResourceProfileManagerSuite.scala | 103 ++ .../spark/resource/ResourceProfileSuite.scala | 79 +- .../spark/resource/ResourceUtilsSuite.scala | 3 + .../CoarseGrainedSchedulerBackendSuite.scala | 13 +- .../scheduler/EventLoggingListenerSuite.scala | 7 +- .../dynalloc/ExecutorMonitorSuite.scala | 19 +- .../spark/status/AppStatusListenerSuite.scala | 76 +- .../status/ListenerEventsTestHelper.scala | 10 +- .../org/apache/spark/ui/StagePageSuite.scala | 4 +- .../apache/spark/util/JsonProtocolSuite.scala | 15 +- python/pyspark/tests/test_context.py | 5 + python/pyspark/tests/test_taskcontext.py | 6 + .../KubernetesClusterSchedulerBackend.scala | 8 +- ...bernetesClusterSchedulerBackendSuite.scala | 4 + .../MesosCoarseGrainedSchedulerBackend.scala | 18 +- ...osCoarseGrainedSchedulerBackendSuite.scala | 26 +- .../cluster/YarnSchedulerBackend.scala | 25 +- .../cluster/YarnSchedulerBackendSuite.scala | 7 +- .../ui/MetricsAggregationBenchmark.scala | 4 +- .../ui/SQLAppStatusListenerSuite.scala | 4 +- .../scheduler/ExecutorAllocationManager.scala | 7 +- .../ExecutorAllocationManagerSuite.scala | 19 +- 46 files changed, 1935 insertions(+), 860 deletions(-) create mode 100644 core/src/main/scala/org/apache/spark/resource/ResourceProfileManager.scala create mode 100644 core/src/test/scala/org/apache/spark/resource/ResourceProfileManagerSuite.scala diff --git a/core/src/main/scala/org/apache/spark/ExecutorAllocationClient.scala b/core/src/main/scala/org/apache/spark/ExecutorAllocationClient.scala index cb965cb180207..00bd0063c9e3a 100644 --- a/core/src/main/scala/org/apache/spark/ExecutorAllocationClient.scala +++ b/core/src/main/scala/org/apache/spark/ExecutorAllocationClient.scala @@ -37,24 +37,29 @@ private[spark] trait ExecutorAllocationClient { /** * Update the cluster manager on our scheduling needs. Three bits of information are included * to help it make decisions. - * @param numExecutors The total number of executors we'd like to have. 
The cluster manager - * shouldn't kill any running executor to reach this number, but, - * if all existing executors were to die, this is the number of executors - * we'd want to be allocated. - * @param localityAwareTasks The number of tasks in all active stages that have a locality - * preferences. This includes running, pending, and completed tasks. - * @param hostToLocalTaskCount A map of hosts to the number of tasks from all active stages - * that would like to like to run on that host. - * This includes running, pending, and completed tasks. + * + * @param resourceProfileIdToNumExecutors The total number of executors we'd like to have per + * ResourceProfile id. The cluster manager shouldn't kill + * any running executor to reach this number, but, if all + * existing executors were to die, this is the number + * of executors we'd want to be allocated. + * @param numLocalityAwareTasksPerResourceProfileId The number of tasks in all active stages that + * have a locality preferences per + * ResourceProfile id. This includes running, + * pending, and completed tasks. + * @param hostToLocalTaskCount A map of ResourceProfile id to a map of hosts to the number of + * tasks from all active stages that would like to like to run on + * that host. This includes running, pending, and completed tasks. * @return whether the request is acknowledged by the cluster manager. */ private[spark] def requestTotalExecutors( - numExecutors: Int, - localityAwareTasks: Int, - hostToLocalTaskCount: Map[String, Int]): Boolean + resourceProfileIdToNumExecutors: Map[Int, Int], + numLocalityAwareTasksPerResourceProfileId: Map[Int, Int], + hostToLocalTaskCount: Map[Int, Map[String, Int]]): Boolean /** - * Request an additional number of executors from the cluster manager. + * Request an additional number of executors from the cluster manager for the default + * ResourceProfile. * @return whether the request is acknowledged by the cluster manager. */ def requestExecutors(numAdditionalExecutors: Int): Boolean diff --git a/core/src/main/scala/org/apache/spark/ExecutorAllocationManager.scala b/core/src/main/scala/org/apache/spark/ExecutorAllocationManager.scala index 677386cc7a572..5cb3160711a90 100644 --- a/core/src/main/scala/org/apache/spark/ExecutorAllocationManager.scala +++ b/core/src/main/scala/org/apache/spark/ExecutorAllocationManager.scala @@ -29,6 +29,8 @@ import org.apache.spark.internal.{config, Logging} import org.apache.spark.internal.config._ import org.apache.spark.internal.config.Tests.TEST_SCHEDULE_INTERVAL import org.apache.spark.metrics.source.Source +import org.apache.spark.resource.ResourceProfile.UNKNOWN_RESOURCE_PROFILE_ID +import org.apache.spark.resource.ResourceProfileManager import org.apache.spark.scheduler._ import org.apache.spark.scheduler.dynalloc.ExecutorMonitor import org.apache.spark.util.{Clock, SystemClock, ThreadUtils, Utils} @@ -36,9 +38,9 @@ import org.apache.spark.util.{Clock, SystemClock, ThreadUtils, Utils} /** * An agent that dynamically allocates and removes executors based on the workload. * - * The ExecutorAllocationManager maintains a moving target number of executors which is periodically - * synced to the cluster manager. The target starts at a configured initial value and changes with - * the number of pending and running tasks. + * The ExecutorAllocationManager maintains a moving target number of executors, for each + * ResourceProfile, which is periodically synced to the cluster manager. 
The target starts + * at a configured initial value and changes with the number of pending and running tasks. * * Decreasing the target number of executors happens when the current target is more than needed to * handle the current load. The target number of executors is always truncated to the number of @@ -57,14 +59,18 @@ import org.apache.spark.util.{Clock, SystemClock, ThreadUtils, Utils} * quickly over time in case the maximum number of executors is very high. Otherwise, it will take * a long time to ramp up under heavy workloads. * - * The remove policy is simpler: If an executor has been idle for K seconds, meaning it has not - * been scheduled to run any tasks, then it is removed. Note that an executor caching any data + * The remove policy is simpler and is applied on each ResourceProfile separately. If an executor + * for that ResourceProfile has been idle for K seconds and the number of executors is more + * then what is needed for that ResourceProfile, meaning there are not enough tasks that could use + * the executor, then it is removed. Note that an executor caching any data * blocks will be removed if it has been idle for more than L seconds. * * There is no retry logic in either case because we make the assumption that the cluster manager * will eventually fulfill all requests it receives asynchronously. * - * The relevant Spark properties include the following: + * The relevant Spark properties are below. Each of these properties applies separately to + * every ResourceProfile. So if you set a minimum number of executors, that is a minimum + * for each ResourceProfile. * * spark.dynamicAllocation.enabled - Whether this feature is enabled * spark.dynamicAllocation.minExecutors - Lower bound on the number of executors @@ -95,7 +101,8 @@ private[spark] class ExecutorAllocationManager( listenerBus: LiveListenerBus, conf: SparkConf, cleaner: Option[ContextCleaner] = None, - clock: Clock = new SystemClock()) + clock: Clock = new SystemClock(), + resourceProfileManager: ResourceProfileManager) extends Logging { allocationManager => @@ -117,23 +124,23 @@ private[spark] class ExecutorAllocationManager( // During testing, the methods to actually kill and add executors are mocked out private val testing = conf.get(DYN_ALLOCATION_TESTING) - // TODO: The default value of 1 for spark.executor.cores works right now because dynamic - // allocation is only supported for YARN and the default number of cores per executor in YARN is - // 1, but it might need to be attained differently for different cluster managers - private val tasksPerExecutorForFullParallelism = - conf.get(EXECUTOR_CORES) / conf.get(CPUS_PER_TASK) - private val executorAllocationRatio = conf.get(DYN_ALLOCATION_EXECUTOR_ALLOCATION_RATIO) + private val defaultProfileId = resourceProfileManager.defaultResourceProfile.id + validateSettings() - // Number of executors to add in the next round - private var numExecutorsToAdd = 1 + // Number of executors to add for each ResourceProfile in the next round + private val numExecutorsToAddPerResourceProfileId = new mutable.HashMap[Int, Int] + numExecutorsToAddPerResourceProfileId(defaultProfileId) = 1 // The desired number of executors at this moment in time. If all our executors were to die, this // is the number of executors we would immediately want from the cluster manager. 
- private var numExecutorsTarget = initialNumExecutors + // Note every profile will be allowed to have initial number, + // we may want to make this configurable per Profile in the future + private val numExecutorsTargetPerResourceProfileId = new mutable.HashMap[Int, Int] + numExecutorsTargetPerResourceProfileId(defaultProfileId) = initialNumExecutors // A timestamp of when an addition should be triggered, or NOT_SET if it is not set // This is set when pending tasks are added but not scheduled yet @@ -165,11 +172,12 @@ private[spark] class ExecutorAllocationManager( // (2) an executor idle timeout has elapsed. @volatile private var initializing: Boolean = true - // Number of locality aware tasks, used for executor placement. - private var localityAwareTasks = 0 + // Number of locality aware tasks for each ResourceProfile, used for executor placement. + private var numLocalityAwareTasksPerResourceProfileId = new mutable.HashMap[Int, Int] + numLocalityAwareTasksPerResourceProfileId(defaultProfileId) = 0 - // Host to possible task running on it, used for executor placement. - private var hostToLocalTaskCount: Map[String, Int] = Map.empty + // ResourceProfile id to Host to possible task running on it, used for executor placement. + private var rpIdToHostToLocalTaskCount: Map[Int, Map[String, Int]] = Map.empty /** * Verify that the settings specified through the config are valid. @@ -233,7 +241,14 @@ private[spark] class ExecutorAllocationManager( } executor.scheduleWithFixedDelay(scheduleTask, 0, intervalMillis, TimeUnit.MILLISECONDS) - client.requestTotalExecutors(numExecutorsTarget, localityAwareTasks, hostToLocalTaskCount) + // copy the maps inside synchonize to ensure not being modified + val (numExecutorsTarget, numLocalityAware) = synchronized { + val numTarget = numExecutorsTargetPerResourceProfileId.toMap + val numLocality = numLocalityAwareTasksPerResourceProfileId.toMap + (numTarget, numLocality) + } + + client.requestTotalExecutors(numExecutorsTarget, numLocalityAware, rpIdToHostToLocalTaskCount) } /** @@ -253,20 +268,28 @@ private[spark] class ExecutorAllocationManager( */ def reset(): Unit = synchronized { addTime = 0L - numExecutorsTarget = initialNumExecutors + numExecutorsTargetPerResourceProfileId.keys.foreach { rpId => + numExecutorsTargetPerResourceProfileId(rpId) = initialNumExecutors + } executorMonitor.reset() } /** - * The maximum number of executors we would need under the current load to satisfy all running - * and pending tasks, rounded up. + * The maximum number of executors, for the ResourceProfile id passed in, that we would need + * under the current load to satisfy all running and pending tasks, rounded up. 
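// Illustrative sketch, not part of the patch: how the per-ResourceProfile bookkeeping introduced
// above behaves. The names mirror the fields in ExecutorAllocationManager, but the profile id and
// initial value used here are made-up assumptions.
object PerProfileTargetsSketch {
  import scala.collection.mutable

  val defaultProfileId = 0           // assumed id of the default ResourceProfile
  val initialNumExecutors = 2        // assumed initial target from the dynamic allocation conf

  // one moving target and one "number to add next round" per ResourceProfile id
  val numExecutorsTargetPerResourceProfileId = mutable.HashMap(defaultProfileId -> initialNumExecutors)
  val numExecutorsToAddPerResourceProfileId = mutable.HashMap(defaultProfileId -> 1)

  // a profile id seen for the first time is seeded lazily, the way onStageSubmitted does it
  def ensureProfileTracked(rpId: Int): Unit = {
    numExecutorsTargetPerResourceProfileId.getOrElseUpdate(rpId, initialNumExecutors)
    numExecutorsToAddPerResourceProfileId.getOrElseUpdate(rpId, 1)
  }
}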
*/ - private def maxNumExecutorsNeeded(): Int = { - val numRunningOrPendingTasks = listener.totalPendingTasks + listener.totalRunningTasks + private def maxNumExecutorsNeededPerResourceProfile(rpId: Int): Int = { + val pending = listener.totalPendingTasksPerResourceProfile(rpId) + val pendingSpeculative = listener.pendingSpeculativeTasksPerResourceProfile(rpId) + val running = listener.totalRunningTasksPerResourceProfile(rpId) + val numRunningOrPendingTasks = pending + running + val rp = resourceProfileManager.resourceProfileFromId(rpId) + val tasksPerExecutor = rp.maxTasksPerExecutor(conf) + logDebug(s"max needed for rpId: $rpId numpending: $numRunningOrPendingTasks," + + s" tasksperexecutor: $tasksPerExecutor") val maxNeeded = math.ceil(numRunningOrPendingTasks * executorAllocationRatio / - tasksPerExecutorForFullParallelism).toInt - if (tasksPerExecutorForFullParallelism > 1 && maxNeeded == 1 && - listener.pendingSpeculativeTasks > 0) { + tasksPerExecutor).toInt + if (tasksPerExecutor > 1 && maxNeeded == 1 && pendingSpeculative > 0) { // If we have pending speculative tasks and only need a single executor, allocate one more // to satisfy the locality requirements of speculation maxNeeded + 1 @@ -275,8 +298,8 @@ private[spark] class ExecutorAllocationManager( } } - private def totalRunningTasks(): Int = synchronized { - listener.totalRunningTasks + private def totalRunningTasksPerResourceProfile(id: Int): Int = synchronized { + listener.totalRunningTasksPerResourceProfile(id) } /** @@ -302,7 +325,8 @@ private[spark] class ExecutorAllocationManager( } /** - * Updates our target number of executors and syncs the result with the cluster manager. + * Updates our target number of executors for each ResourceProfile and then syncs the result + * with the cluster manager. * * Check to see whether our existing allocation and the requests we've made previously exceed our * current needs. If so, truncate our target and let the cluster manager know so that it can @@ -314,130 +338,205 @@ private[spark] class ExecutorAllocationManager( * @return the delta in the target number of executors. */ private def updateAndSyncNumExecutorsTarget(now: Long): Int = synchronized { - val maxNeeded = maxNumExecutorsNeeded - if (initializing) { // Do not change our target while we are still initializing, // Otherwise the first job may have to ramp up unnecessarily 0 - } else if (maxNeeded < numExecutorsTarget) { - // The target number exceeds the number we actually need, so stop adding new - // executors and inform the cluster manager to cancel the extra pending requests - val oldNumExecutorsTarget = numExecutorsTarget - numExecutorsTarget = math.max(maxNeeded, minNumExecutors) - numExecutorsToAdd = 1 - - // If the new target has not changed, avoid sending a message to the cluster manager - if (numExecutorsTarget < oldNumExecutorsTarget) { - // We lower the target number of executors but don't actively kill any yet. Killing is - // controlled separately by an idle timeout. It's still helpful to reduce the target number - // in case an executor just happens to get lost (eg., bad hardware, or the cluster manager - // preempts it) -- in that case, there is no point in trying to immediately get a new - // executor, since we wouldn't even use it yet. 
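// Illustrative sketch, not part of the patch: the arithmetic behind
// maxNumExecutorsNeededPerResourceProfile above, written as a pure function. The allocation ratio
// and tasks-per-executor arguments stand in for the conf and ResourceProfile lookups.
object MaxExecutorsNeededSketch {
  def maxNeeded(
      pending: Int,
      pendingSpeculative: Int,
      running: Int,
      tasksPerExecutor: Int,
      executorAllocationRatio: Double = 1.0): Int = {
    val base = math.ceil((pending + running) * executorAllocationRatio / tasksPerExecutor).toInt
    // keep one extra executor for speculation locality when only a single executor would be needed
    if (tasksPerExecutor > 1 && base == 1 && pendingSpeculative > 0) base + 1 else base
  }
  // e.g. 10 pending + 2 running tasks with 4 slots per executor => ceil(12 / 4) = 3 executors
}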
- client.requestTotalExecutors(numExecutorsTarget, localityAwareTasks, hostToLocalTaskCount) - logDebug(s"Lowering target number of executors to $numExecutorsTarget (previously " + - s"$oldNumExecutorsTarget) because not all requested executors are actually needed") + } else { + val updatesNeeded = new mutable.HashMap[Int, ExecutorAllocationManager.TargetNumUpdates] + + // Update targets for all ResourceProfiles then do a single request to the cluster manager + numExecutorsTargetPerResourceProfileId.foreach { case (rpId, targetExecs) => + val maxNeeded = maxNumExecutorsNeededPerResourceProfile(rpId) + if (maxNeeded < targetExecs) { + // The target number exceeds the number we actually need, so stop adding new + // executors and inform the cluster manager to cancel the extra pending requests + + // We lower the target number of executors but don't actively kill any yet. Killing is + // controlled separately by an idle timeout. It's still helpful to reduce + // the target number in case an executor just happens to get lost (eg., bad hardware, + // or the cluster manager preempts it) -- in that case, there is no point in trying + // to immediately get a new executor, since we wouldn't even use it yet. + decrementExecutorsFromTarget(maxNeeded, rpId, updatesNeeded) + } else if (addTime != NOT_SET && now >= addTime) { + addExecutorsToTarget(maxNeeded, rpId, updatesNeeded) + } + } + doUpdateRequest(updatesNeeded.toMap, now) + } + } + + private def addExecutorsToTarget( + maxNeeded: Int, + rpId: Int, + updatesNeeded: mutable.HashMap[Int, ExecutorAllocationManager.TargetNumUpdates]): Int = { + updateTargetExecs(addExecutors, maxNeeded, rpId, updatesNeeded) + } + + private def decrementExecutorsFromTarget( + maxNeeded: Int, + rpId: Int, + updatesNeeded: mutable.HashMap[Int, ExecutorAllocationManager.TargetNumUpdates]): Int = { + updateTargetExecs(decrementExecutors, maxNeeded, rpId, updatesNeeded) + } + + private def updateTargetExecs( + updateTargetFn: (Int, Int) => Int, + maxNeeded: Int, + rpId: Int, + updatesNeeded: mutable.HashMap[Int, ExecutorAllocationManager.TargetNumUpdates]): Int = { + val oldNumExecutorsTarget = numExecutorsTargetPerResourceProfileId(rpId) + // update the target number (add or remove) + val delta = updateTargetFn(maxNeeded, rpId) + if (delta != 0) { + updatesNeeded(rpId) = ExecutorAllocationManager.TargetNumUpdates(delta, oldNumExecutorsTarget) + } + delta + } + + private def doUpdateRequest( + updates: Map[Int, ExecutorAllocationManager.TargetNumUpdates], + now: Long): Int = { + // Only call cluster manager if target has changed. + if (updates.size > 0) { + val requestAcknowledged = try { + logDebug("requesting updates: " + updates) + testing || + client.requestTotalExecutors( + numExecutorsTargetPerResourceProfileId.toMap, + numLocalityAwareTasksPerResourceProfileId.toMap, + rpIdToHostToLocalTaskCount) + } catch { + case NonFatal(e) => + // Use INFO level so the error it doesn't show up by default in shells. + // Errors here are more commonly caused by YARN AM restarts, which is a recoverable + // issue, and generate a lot of noisy output. 
+ logInfo("Error reaching cluster manager.", e) + false + } + if (requestAcknowledged) { + // have to go through all resource profiles that changed + var totalDelta = 0 + updates.foreach { case (rpId, targetNum) => + val delta = targetNum.delta + totalDelta += delta + if (delta > 0) { + val executorsString = "executor" + { if (delta > 1) "s" else "" } + logInfo(s"Requesting $delta new $executorsString because tasks are backlogged " + + s"(new desired total will be ${numExecutorsTargetPerResourceProfileId(rpId)} " + + s"for resource profile id: ${rpId})") + numExecutorsToAddPerResourceProfileId(rpId) = + if (delta == numExecutorsToAddPerResourceProfileId(rpId)) { + numExecutorsToAddPerResourceProfileId(rpId) * 2 + } else { + 1 + } + logDebug(s"Starting timer to add more executors (to " + + s"expire in $sustainedSchedulerBacklogTimeoutS seconds)") + addTime = now + TimeUnit.SECONDS.toNanos(sustainedSchedulerBacklogTimeoutS) + } else { + logDebug(s"Lowering target number of executors to" + + s" ${numExecutorsTargetPerResourceProfileId(rpId)} (previously " + + s"$targetNum.oldNumExecutorsTarget for resource profile id: ${rpId}) " + + "because not all requested executors " + + "are actually needed") + } + } + totalDelta + } else { + // request was for all profiles so we have to go through all to reset to old num + updates.foreach { case (rpId, targetNum) => + logWarning("Unable to reach the cluster manager to request more executors!") + numExecutorsTargetPerResourceProfileId(rpId) = targetNum.oldNumExecutorsTarget + } + 0 } - numExecutorsTarget - oldNumExecutorsTarget - } else if (addTime != NOT_SET && now >= addTime) { - val delta = addExecutors(maxNeeded) - logDebug(s"Starting timer to add more executors (to " + - s"expire in $sustainedSchedulerBacklogTimeoutS seconds)") - addTime = now + TimeUnit.SECONDS.toNanos(sustainedSchedulerBacklogTimeoutS) - delta } else { + logDebug("No change in number of executors") 0 } } + private def decrementExecutors(maxNeeded: Int, rpId: Int): Int = { + val oldNumExecutorsTarget = numExecutorsTargetPerResourceProfileId(rpId) + numExecutorsTargetPerResourceProfileId(rpId) = math.max(maxNeeded, minNumExecutors) + numExecutorsToAddPerResourceProfileId(rpId) = 1 + numExecutorsTargetPerResourceProfileId(rpId) - oldNumExecutorsTarget + } + /** - * Request a number of executors from the cluster manager. + * Update the target number of executors and figure out how many to add. * If the cap on the number of executors is reached, give up and reset the * number of executors to add next round instead of continuing to double it. * * @param maxNumExecutorsNeeded the maximum number of executors all currently running or pending * tasks could fill + * @param rpId the ResourceProfile id of the executors * @return the number of additional executors actually requested. 
*/ - private def addExecutors(maxNumExecutorsNeeded: Int): Int = { + private def addExecutors(maxNumExecutorsNeeded: Int, rpId: Int): Int = { + val oldNumExecutorsTarget = numExecutorsTargetPerResourceProfileId(rpId) // Do not request more executors if it would put our target over the upper bound - if (numExecutorsTarget >= maxNumExecutors) { - logDebug(s"Not adding executors because our current target total " + - s"is already $numExecutorsTarget (limit $maxNumExecutors)") - numExecutorsToAdd = 1 + // this is doing a max check per ResourceProfile + if (oldNumExecutorsTarget >= maxNumExecutors) { + logDebug("Not adding executors because our current target total " + + s"is already ${oldNumExecutorsTarget} (limit $maxNumExecutors)") + numExecutorsToAddPerResourceProfileId(rpId) = 1 return 0 } - - val oldNumExecutorsTarget = numExecutorsTarget // There's no point in wasting time ramping up to the number of executors we already have, so // make sure our target is at least as much as our current allocation: - numExecutorsTarget = math.max(numExecutorsTarget, executorMonitor.executorCount) + var numExecutorsTarget = math.max(numExecutorsTargetPerResourceProfileId(rpId), + executorMonitor.executorCountWithResourceProfile(rpId)) // Boost our target with the number to add for this round: - numExecutorsTarget += numExecutorsToAdd + numExecutorsTarget += numExecutorsToAddPerResourceProfileId(rpId) // Ensure that our target doesn't exceed what we need at the present moment: numExecutorsTarget = math.min(numExecutorsTarget, maxNumExecutorsNeeded) // Ensure that our target fits within configured bounds: numExecutorsTarget = math.max(math.min(numExecutorsTarget, maxNumExecutors), minNumExecutors) - val delta = numExecutorsTarget - oldNumExecutorsTarget + numExecutorsTargetPerResourceProfileId(rpId) = numExecutorsTarget // If our target has not changed, do not send a message // to the cluster manager and reset our exponential growth if (delta == 0) { - numExecutorsToAdd = 1 - return 0 - } - - val addRequestAcknowledged = try { - testing || - client.requestTotalExecutors(numExecutorsTarget, localityAwareTasks, hostToLocalTaskCount) - } catch { - case NonFatal(e) => - // Use INFO level so the error it doesn't show up by default in shells. Errors here are more - // commonly caused by YARN AM restarts, which is a recoverable issue, and generate a lot of - // noisy output. - logInfo("Error reaching cluster manager.", e) - false - } - if (addRequestAcknowledged) { - val executorsString = "executor" + { if (delta > 1) "s" else "" } - logInfo(s"Requesting $delta new $executorsString because tasks are backlogged" + - s" (new desired total will be $numExecutorsTarget)") - numExecutorsToAdd = if (delta == numExecutorsToAdd) { - numExecutorsToAdd * 2 - } else { - 1 - } - delta - } else { - logWarning( - s"Unable to reach the cluster manager to request $numExecutorsTarget total executors!") - numExecutorsTarget = oldNumExecutorsTarget - 0 + numExecutorsToAddPerResourceProfileId(rpId) = 1 } + delta } /** * Request the cluster manager to remove the given executors. * Returns the list of executors which are removed. 
*/ - private def removeExecutors(executors: Seq[String]): Seq[String] = synchronized { + private def removeExecutors(executors: Seq[(String, Int)]): Seq[String] = synchronized { val executorIdsToBeRemoved = new ArrayBuffer[String] - logDebug(s"Request to remove executorIds: ${executors.mkString(", ")}") - val numExistingExecutors = executorMonitor.executorCount - executorMonitor.pendingRemovalCount - - var newExecutorTotal = numExistingExecutors - executors.foreach { executorIdToBeRemoved => - if (newExecutorTotal - 1 < minNumExecutors) { - logDebug(s"Not removing idle executor $executorIdToBeRemoved because there are only " + - s"$newExecutorTotal executor(s) left (minimum number of executor limit $minNumExecutors)") - } else if (newExecutorTotal - 1 < numExecutorsTarget) { - logDebug(s"Not removing idle executor $executorIdToBeRemoved because there are only " + - s"$newExecutorTotal executor(s) left (number of executor target $numExecutorsTarget)") + val numExecutorsTotalPerRpId = mutable.Map[Int, Int]() + executors.foreach { case (executorIdToBeRemoved, rpId) => + if (rpId == UNKNOWN_RESOURCE_PROFILE_ID) { + if (testing) { + throw new SparkException("ResourceProfile Id was UNKNOWN, this is not expected") + } + logWarning(s"Not removing executor $executorIdsToBeRemoved because the " + + "ResourceProfile was UNKNOWN!") } else { - executorIdsToBeRemoved += executorIdToBeRemoved - newExecutorTotal -= 1 + // get the running total as we remove or initialize it to the count - pendingRemoval + val newExecutorTotal = numExecutorsTotalPerRpId.getOrElseUpdate(rpId, + (executorMonitor.executorCountWithResourceProfile(rpId) - + executorMonitor.pendingRemovalCountPerResourceProfileId(rpId))) + if (newExecutorTotal - 1 < minNumExecutors) { + logDebug(s"Not removing idle executor $executorIdToBeRemoved because there " + + s"are only $newExecutorTotal executor(s) left (minimum number of executor limit " + + s"$minNumExecutors)") + } else if (newExecutorTotal - 1 < numExecutorsTargetPerResourceProfileId(rpId)) { + logDebug(s"Not removing idle executor $executorIdToBeRemoved because there " + + s"are only $newExecutorTotal executor(s) left (number of executor " + + s"target ${numExecutorsTargetPerResourceProfileId(rpId)})") + } else { + executorIdsToBeRemoved += executorIdToBeRemoved + numExecutorsTotalPerRpId(rpId) -= 1 + } } } @@ -457,14 +556,15 @@ private[spark] class ExecutorAllocationManager( // [SPARK-21834] killExecutors api reduces the target number of executors. // So we need to update the target with desired value. - client.requestTotalExecutors(numExecutorsTarget, localityAwareTasks, hostToLocalTaskCount) + client.requestTotalExecutors( + numExecutorsTargetPerResourceProfileId.toMap, + numLocalityAwareTasksPerResourceProfileId.toMap, + rpIdToHostToLocalTaskCount) + // reset the newExecutorTotal to the existing number of executors - newExecutorTotal = numExistingExecutors if (testing || executorsRemoved.nonEmpty) { - newExecutorTotal -= executorsRemoved.size executorMonitor.executorsKilled(executorsRemoved) - logInfo(s"Executors ${executorsRemoved.mkString(",")} removed due to idle timeout." 
+ - s"(new desired total will be $newExecutorTotal)") + logInfo(s"Executors ${executorsRemoved.mkString(",")} removed due to idle timeout.") executorsRemoved } else { logWarning(s"Unable to reach the cluster manager to kill executor/s " + @@ -493,7 +593,7 @@ private[spark] class ExecutorAllocationManager( private def onSchedulerQueueEmpty(): Unit = synchronized { logDebug("Clearing timer to add executors because there are no more pending tasks") addTime = NOT_SET - numExecutorsToAdd = 1 + numExecutorsToAddPerResourceProfileId.transform { case (_, _) => 1 } } private case class StageAttempt(stageId: Int, stageAttemptId: Int) { @@ -519,12 +619,16 @@ private[spark] class ExecutorAllocationManager( private val stageAttemptToSpeculativeTaskIndices = new mutable.HashMap[StageAttempt, mutable.HashSet[Int]] + private val resourceProfileIdToStageAttempt = + new mutable.HashMap[Int, mutable.Set[StageAttempt]] + // stageAttempt to tuple (the number of task with locality preferences, a map where each pair - // is a node and the number of tasks that would like to be scheduled on that node) map, + // is a node and the number of tasks that would like to be scheduled on that node, and + // the resource profile id) map, // maintain the executor placement hints for each stageAttempt used by resource framework // to better place the executors. private val stageAttemptToExecutorPlacementHints = - new mutable.HashMap[StageAttempt, (Int, Map[String, Int])] + new mutable.HashMap[StageAttempt, (Int, Map[String, Int], Int)] override def onStageSubmitted(stageSubmitted: SparkListenerStageSubmitted): Unit = { initializing = false @@ -535,6 +639,13 @@ private[spark] class ExecutorAllocationManager( allocationManager.synchronized { stageAttemptToNumTasks(stageAttempt) = numTasks allocationManager.onSchedulerBacklogged() + // need to keep stage task requirements to ask for the right containers + val profId = stageSubmitted.stageInfo.resourceProfileId + logDebug(s"Stage resource profile id is: $profId with numTasks: $numTasks") + resourceProfileIdToStageAttempt.getOrElseUpdate( + profId, new mutable.HashSet[StageAttempt]) += stageAttempt + numExecutorsToAddPerResourceProfileId.getOrElseUpdate(profId, 1) + numExecutorsTargetPerResourceProfileId.getOrElseUpdate(profId, initialNumExecutors) // Compute the number of tasks requested by the stage on each host var numTasksPending = 0 @@ -549,7 +660,7 @@ private[spark] class ExecutorAllocationManager( } } stageAttemptToExecutorPlacementHints.put(stageAttempt, - (numTasksPending, hostToLocalTaskCountPerStage.toMap)) + (numTasksPending, hostToLocalTaskCountPerStage.toMap, profId)) // Update the executor placement hints updateExecutorPlacementHints() @@ -561,7 +672,7 @@ private[spark] class ExecutorAllocationManager( val stageAttemptId = stageCompleted.stageInfo.attemptNumber() val stageAttempt = StageAttempt(stageId, stageAttemptId) allocationManager.synchronized { - // do NOT remove stageAttempt from stageAttemptToNumRunningTasks, + // do NOT remove stageAttempt from stageAttemptToNumRunningTask // because the attempt may still have running tasks, // even after another attempt for the stage is submitted. 
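// Illustrative sketch, not part of the patch: the reverse index kept above so task counts can be
// aggregated per ResourceProfile id. StageAttempt mirrors the listener's private case class; the
// ids used are hypothetical.
object ProfileToStageIndexSketch {
  import scala.collection.mutable

  case class StageAttempt(stageId: Int, stageAttemptId: Int)

  val resourceProfileIdToStageAttempt = new mutable.HashMap[Int, mutable.Set[StageAttempt]]

  // called when a stage is submitted with a given resource profile id
  def trackStage(rpId: Int, attempt: StageAttempt): Unit = {
    resourceProfileIdToStageAttempt.getOrElseUpdate(rpId, new mutable.HashSet[StageAttempt]) += attempt
  }

  // any per-profile aggregation only has to walk that profile's stage attempts
  def attemptsFor(rpId: Int): Seq[StageAttempt] =
    resourceProfileIdToStageAttempt.getOrElse(rpId, Set.empty[StageAttempt]).toSeq
}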
stageAttemptToNumTasks -= stageAttempt @@ -597,7 +708,7 @@ private[spark] class ExecutorAllocationManager( stageAttemptToTaskIndices.getOrElseUpdate(stageAttempt, new mutable.HashSet[Int]) += taskIndex } - if (totalPendingTasks() == 0) { + if (!hasPendingTasks) { allocationManager.onSchedulerQueueEmpty() } } @@ -613,9 +724,22 @@ private[spark] class ExecutorAllocationManager( stageAttemptToNumRunningTask(stageAttempt) -= 1 if (stageAttemptToNumRunningTask(stageAttempt) == 0) { stageAttemptToNumRunningTask -= stageAttempt + if (!stageAttemptToNumTasks.contains(stageAttempt)) { + val rpForStage = resourceProfileIdToStageAttempt.filter { case (k, v) => + v.contains(stageAttempt) + }.keys + if (rpForStage.size == 1) { + // be careful about the removal from here due to late tasks, make sure stage is + // really complete and no tasks left + resourceProfileIdToStageAttempt(rpForStage.head) -= stageAttempt + } else { + logWarning(s"Should have exactly one resource profile for stage $stageAttempt," + + s" but have $rpForStage") + } + } + } } - if (taskEnd.taskInfo.speculative) { stageAttemptToSpeculativeTaskIndices.get(stageAttempt).foreach {_.remove{taskIndex}} stageAttemptToNumSpeculativeTasks(stageAttempt) -= 1 @@ -624,7 +748,7 @@ private[spark] class ExecutorAllocationManager( taskEnd.reason match { case Success | _: TaskKilled => case _ => - if (totalPendingTasks() == 0) { + if (!hasPendingTasks) { // If the task failed (not intentionally killed), we expect it to be resubmitted // later. To ensure we have enough resources to run the resubmitted task, we need to // mark the scheduler as backlogged again if it's not already marked as such @@ -661,20 +785,46 @@ private[spark] class ExecutorAllocationManager( * * Note: This is not thread-safe without the caller owning the `allocationManager` lock. 
*/ - def pendingTasks(): Int = { - stageAttemptToNumTasks.map { case (stageAttempt, numTasks) => - numTasks - stageAttemptToTaskIndices.get(stageAttempt).map(_.size).getOrElse(0) - }.sum + def pendingTasksPerResourceProfile(rpId: Int): Int = { + val attempts = resourceProfileIdToStageAttempt.getOrElse(rpId, Set.empty).toSeq + attempts.map(attempt => getPendingTaskSum(attempt)).sum } - def pendingSpeculativeTasks(): Int = { - stageAttemptToNumSpeculativeTasks.map { case (stageAttempt, numTasks) => - numTasks - stageAttemptToSpeculativeTaskIndices.get(stageAttempt).map(_.size).getOrElse(0) - }.sum + def hasPendingRegularTasks: Boolean = { + val attemptSets = resourceProfileIdToStageAttempt.values + attemptSets.exists(attempts => attempts.exists(getPendingTaskSum(_) > 0)) + } + + private def getPendingTaskSum(attempt: StageAttempt): Int = { + val numTotalTasks = stageAttemptToNumTasks.getOrElse(attempt, 0) + val numRunning = stageAttemptToTaskIndices.get(attempt).map(_.size).getOrElse(0) + numTotalTasks - numRunning } - def totalPendingTasks(): Int = { - pendingTasks + pendingSpeculativeTasks + def pendingSpeculativeTasksPerResourceProfile(rp: Int): Int = { + val attempts = resourceProfileIdToStageAttempt.getOrElse(rp, Set.empty).toSeq + attempts.map(attempt => getPendingSpeculativeTaskSum(attempt)).sum + } + + def hasPendingSpeculativeTasks: Boolean = { + val attemptSets = resourceProfileIdToStageAttempt.values + attemptSets.exists { attempts => + attempts.exists(getPendingSpeculativeTaskSum(_) > 0) + } + } + + private def getPendingSpeculativeTaskSum(attempt: StageAttempt): Int = { + val numTotalTasks = stageAttemptToNumSpeculativeTasks.getOrElse(attempt, 0) + val numRunning = stageAttemptToSpeculativeTaskIndices.get(attempt).map(_.size).getOrElse(0) + numTotalTasks - numRunning + } + + def hasPendingTasks: Boolean = { + hasPendingSpeculativeTasks || hasPendingRegularTasks + } + + def totalPendingTasksPerResourceProfile(rp: Int): Int = { + pendingTasksPerResourceProfile(rp) + pendingSpeculativeTasksPerResourceProfile(rp) } /** @@ -685,6 +835,14 @@ private[spark] class ExecutorAllocationManager( stageAttemptToNumRunningTask.values.sum } + def totalRunningTasksPerResourceProfile(rp: Int): Int = { + val attempts = resourceProfileIdToStageAttempt.getOrElse(rp, Set.empty).toSeq + // attempts is a Set, change to Seq so we keep all values + attempts.map { attempt => + stageAttemptToNumRunningTask.getOrElseUpdate(attempt, 0) + }.sum + } + /** * Update the Executor placement hints (the number of tasks with locality preferences, * a map where each pair is a node and the number of tasks that would like to be scheduled @@ -694,18 +852,27 @@ private[spark] class ExecutorAllocationManager( * granularity within stages. 
*/ def updateExecutorPlacementHints(): Unit = { - var localityAwareTasks = 0 - val localityToCount = new mutable.HashMap[String, Int]() - stageAttemptToExecutorPlacementHints.values.foreach { case (numTasksPending, localities) => - localityAwareTasks += numTasksPending - localities.foreach { case (hostname, count) => - val updatedCount = localityToCount.getOrElse(hostname, 0) + count - localityToCount(hostname) = updatedCount - } + val localityAwareTasksPerResourceProfileId = new mutable.HashMap[Int, Int] + + // ResourceProfile id => map[host, count] + val rplocalityToCount = new mutable.HashMap[Int, mutable.HashMap[String, Int]]() + stageAttemptToExecutorPlacementHints.values.foreach { + case (numTasksPending, localities, rpId) => + val rpNumPending = + localityAwareTasksPerResourceProfileId.getOrElse(rpId, 0) + localityAwareTasksPerResourceProfileId(rpId) = rpNumPending + numTasksPending + localities.foreach { case (hostname, count) => + val rpBasedHostToCount = + rplocalityToCount.getOrElseUpdate(rpId, new mutable.HashMap[String, Int]) + val newUpdated = rpBasedHostToCount.getOrElse(hostname, 0) + count + rpBasedHostToCount(hostname) = newUpdated + } } - allocationManager.localityAwareTasks = localityAwareTasks - allocationManager.hostToLocalTaskCount = localityToCount.toMap + allocationManager.numLocalityAwareTasksPerResourceProfileId = + localityAwareTasksPerResourceProfileId + allocationManager.rpIdToHostToLocalTaskCount = + rplocalityToCount.map { case (k, v) => (k, v.toMap)}.toMap } } @@ -726,14 +893,22 @@ private[spark] class ExecutorAllocationManager( }) } - registerGauge("numberExecutorsToAdd", numExecutorsToAdd, 0) + // The metrics are going to return the sum for all the different ResourceProfiles. + registerGauge("numberExecutorsToAdd", + numExecutorsToAddPerResourceProfileId.values.sum, 0) registerGauge("numberExecutorsPendingToRemove", executorMonitor.pendingRemovalCount, 0) registerGauge("numberAllExecutors", executorMonitor.executorCount, 0) - registerGauge("numberTargetExecutors", numExecutorsTarget, 0) - registerGauge("numberMaxNeededExecutors", maxNumExecutorsNeeded(), 0) + registerGauge("numberTargetExecutors", + numExecutorsTargetPerResourceProfileId.values.sum, 0) + registerGauge("numberMaxNeededExecutors", numExecutorsTargetPerResourceProfileId.keys + .map(maxNumExecutorsNeededPerResourceProfile(_)).sum, 0) } } private object ExecutorAllocationManager { val NOT_SET = Long.MaxValue + + // helper case class for requesting executors, here to be visible for testing + private[spark] case class TargetNumUpdates(delta: Int, oldNumExecutorsTarget: Int) + } diff --git a/core/src/main/scala/org/apache/spark/SparkContext.scala b/core/src/main/scala/org/apache/spark/SparkContext.scala index 91188d58f4201..a47136ea36736 100644 --- a/core/src/main/scala/org/apache/spark/SparkContext.scala +++ b/core/src/main/scala/org/apache/spark/SparkContext.scala @@ -25,6 +25,7 @@ import java.util.concurrent.atomic.{AtomicBoolean, AtomicInteger, AtomicReferenc import scala.collection.JavaConverters._ import scala.collection.Map +import scala.collection.immutable import scala.collection.mutable.HashMap import scala.language.implicitConversions import scala.reflect.{classTag, ClassTag} @@ -53,7 +54,7 @@ import org.apache.spark.io.CompressionCodec import org.apache.spark.metrics.source.JVMCPUSource import org.apache.spark.partial.{ApproximateEvaluator, PartialResult} import org.apache.spark.rdd._ -import org.apache.spark.resource.{ResourceID, ResourceInformation} +import 
org.apache.spark.resource._ import org.apache.spark.resource.ResourceUtils._ import org.apache.spark.rpc.RpcEndpointRef import org.apache.spark.scheduler._ @@ -219,9 +220,10 @@ class SparkContext(config: SparkConf) extends Logging { private var _shutdownHookRef: AnyRef = _ private var _statusStore: AppStatusStore = _ private var _heartbeater: Heartbeater = _ - private var _resources: scala.collection.immutable.Map[String, ResourceInformation] = _ + private var _resources: immutable.Map[String, ResourceInformation] = _ private var _shuffleDriverComponents: ShuffleDriverComponents = _ private var _plugins: Option[PluginContainer] = None + private var _resourceProfileManager: ResourceProfileManager = _ /* ------------------------------------------------------------------------------------- * | Accessors and public fields. These provide access to the internal state of the | @@ -343,6 +345,8 @@ class SparkContext(config: SparkConf) extends Logging { private[spark] def executorAllocationManager: Option[ExecutorAllocationManager] = _executorAllocationManager + private[spark] def resourceProfileManager: ResourceProfileManager = _resourceProfileManager + private[spark] def cleaner: Option[ContextCleaner] = _cleaner private[spark] var checkpointDir: Option[String] = None @@ -451,6 +455,7 @@ class SparkContext(config: SparkConf) extends Logging { } _listenerBus = new LiveListenerBus(_conf) + _resourceProfileManager = new ResourceProfileManager(_conf) // Initialize the app status store and listener before SparkEnv is created so that it gets // all events. @@ -611,7 +616,7 @@ class SparkContext(config: SparkConf) extends Logging { case b: ExecutorAllocationClient => Some(new ExecutorAllocationManager( schedulerBackend.asInstanceOf[ExecutorAllocationClient], listenerBus, _conf, - cleaner = cleaner)) + cleaner = cleaner, resourceProfileManager = resourceProfileManager)) case _ => None } @@ -1622,7 +1627,7 @@ class SparkContext(config: SparkConf) extends Logging { /** * Update the cluster manager on our scheduling needs. Three bits of information are included - * to help it make decisions. + * to help it make decisions. This applies to the default ResourceProfile. * @param numExecutors The total number of executors we'd like to have. The cluster manager * shouldn't kill any running executor to reach this number, but, * if all existing executors were to die, this is the number of executors @@ -1638,11 +1643,16 @@ class SparkContext(config: SparkConf) extends Logging { def requestTotalExecutors( numExecutors: Int, localityAwareTasks: Int, - hostToLocalTaskCount: scala.collection.immutable.Map[String, Int] + hostToLocalTaskCount: immutable.Map[String, Int] ): Boolean = { schedulerBackend match { case b: ExecutorAllocationClient => - b.requestTotalExecutors(numExecutors, localityAwareTasks, hostToLocalTaskCount) + // this is being applied to the default resource profile, would need to add api to support + // others + val defaultProfId = resourceProfileManager.defaultResourceProfile.id + b.requestTotalExecutors(immutable.Map(defaultProfId -> numExecutors), + immutable.Map(defaultProfId -> localityAwareTasks), + immutable.Map(defaultProfId -> hostToLocalTaskCount)) case _ => logWarning("Requesting executors is not supported by current scheduler.") false @@ -2036,6 +2046,7 @@ class SparkContext(config: SparkConf) extends Logging { // Clear this `InheritableThreadLocal`, or it will still be inherited in child threads even this // `SparkContext` is stopped.
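// Illustrative sketch, not part of the patch: the shape of the three per-profile maps that the
// default-profile wrapper above builds from the legacy (numExecutors, localityAwareTasks,
// hostToLocalTaskCount) arguments. Profile id 0 is an assumed id for the default profile.
object DefaultProfileRequestSketch {
  def wrap(
      defaultProfId: Int,
      numExecutors: Int,
      localityAwareTasks: Int,
      hostToLocalTaskCount: Map[String, Int])
    : (Map[Int, Int], Map[Int, Int], Map[Int, Map[String, Int]]) = {
    (Map(defaultProfId -> numExecutors),          // executors wanted, keyed by profile id
      Map(defaultProfId -> localityAwareTasks),   // locality-aware task count, keyed by profile id
      Map(defaultProfId -> hostToLocalTaskCount)) // host -> task count hints, keyed by profile id
  }
  // e.g. wrap(0, 10, 4, Map("host1" -> 3)) keys every piece of information by profile id 0
}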
localProperties.remove() + ResourceProfile.clearDefaultProfile() // Unset YARN mode system env variable, to allow switching between cluster types. SparkContext.clearActiveContext() logInfo("Successfully stopped SparkContext") @@ -2771,109 +2782,34 @@ object SparkContext extends Logging { // When running locally, don't try to re-execute tasks on failure. val MAX_LOCAL_TASK_FAILURES = 1 - // Ensure that executor's resources satisfies one or more tasks requirement. - def checkResourcesPerTask(clusterMode: Boolean, executorCores: Option[Int]): Unit = { + // Ensure that default executor's resources satisfies one or more tasks requirement. + // This function is for cluster managers that don't set the executor cores config, for + // others its checked in ResourceProfile. + def checkResourcesPerTask(executorCores: Int): Unit = { val taskCores = sc.conf.get(CPUS_PER_TASK) - val execCores = if (clusterMode) { - executorCores.getOrElse(sc.conf.get(EXECUTOR_CORES)) - } else { - executorCores.get - } - // some cluster managers don't set the EXECUTOR_CORES config by default (standalone - // and mesos coarse grained), so we can't rely on that config for those. - val shouldCheckExecCores = executorCores.isDefined || sc.conf.contains(EXECUTOR_CORES) || - (master.equalsIgnoreCase("yarn") || master.startsWith("k8s")) - - // Number of cores per executor must meet at least one task requirement. - if (shouldCheckExecCores && execCores < taskCores) { - throw new SparkException(s"The number of cores per executor (=$execCores) has to be >= " + - s"the task config: ${CPUS_PER_TASK.key} = $taskCores when run on $master.") - } - - // Calculate the max slots each executor can provide based on resources available on each - // executor and resources required by each task. - val taskResourceRequirements = parseResourceRequirements(sc.conf, SPARK_TASK_PREFIX) - val executorResourcesAndAmounts = parseAllResourceRequests(sc.conf, SPARK_EXECUTOR_PREFIX) - .map(request => (request.id.resourceName, request.amount)).toMap - - var (numSlots, limitingResourceName) = if (shouldCheckExecCores) { - (execCores / taskCores, "CPU") - } else { - (-1, "") - } - - taskResourceRequirements.foreach { taskReq => - // Make sure the executor resources were specified through config. - val execAmount = executorResourcesAndAmounts.getOrElse(taskReq.resourceName, - throw new SparkException("The executor resource config: " + - new ResourceID(SPARK_EXECUTOR_PREFIX, taskReq.resourceName).amountConf + - " needs to be specified since a task requirement config: " + - new ResourceID(SPARK_TASK_PREFIX, taskReq.resourceName).amountConf + - " was specified") - ) - // Make sure the executor resources are large enough to launch at least one task. - if (execAmount < taskReq.amount) { - throw new SparkException("The executor resource config: " + - new ResourceID(SPARK_EXECUTOR_PREFIX, taskReq.resourceName).amountConf + - s" = $execAmount has to be >= the requested amount in task resource config: " + - new ResourceID(SPARK_TASK_PREFIX, taskReq.resourceName).amountConf + - s" = ${taskReq.amount}") - } - // Compare and update the max slots each executor can provide. - // If the configured amount per task was < 1.0, a task is subdividing - // executor resources. If the amount per task was > 1.0, the task wants - // multiple executor resources. 
- val resourceNumSlots = Math.floor(execAmount * taskReq.numParts / taskReq.amount).toInt - if (resourceNumSlots < numSlots) { - if (shouldCheckExecCores) { - throw new IllegalArgumentException("The number of slots on an executor has to be " + - "limited by the number of cores, otherwise you waste resources and " + - "dynamic allocation doesn't work properly. Your configuration has " + - s"core/task cpu slots = ${numSlots} and " + - s"${taskReq.resourceName} = ${resourceNumSlots}. " + - "Please adjust your configuration so that all resources require same number " + - "of executor slots.") - } - numSlots = resourceNumSlots - limitingResourceName = taskReq.resourceName - } - } - if(!shouldCheckExecCores && Utils.isDynamicAllocationEnabled(sc.conf)) { - // if we can't rely on the executor cores config throw a warning for user - logWarning("Please ensure that the number of slots available on your " + - "executors is limited by the number of cores to task cpus and not another " + - "custom resource. If cores is not the limiting resource then dynamic " + - "allocation will not work properly!") - } - // warn if we would waste any resources due to another resource limiting the number of - // slots on an executor - taskResourceRequirements.foreach { taskReq => - val execAmount = executorResourcesAndAmounts(taskReq.resourceName) - if ((numSlots * taskReq.amount / taskReq.numParts) < execAmount) { - val taskReqStr = if (taskReq.numParts > 1) { - s"${taskReq.amount}/${taskReq.numParts}" - } else { - s"${taskReq.amount}" - } - val resourceNumSlots = Math.floor(execAmount * taskReq.numParts / taskReq.amount).toInt - val message = s"The configuration of resource: ${taskReq.resourceName} " + - s"(exec = ${execAmount}, task = ${taskReqStr}, " + - s"runnable tasks = ${resourceNumSlots}) will " + - s"result in wasted resources due to resource ${limitingResourceName} limiting the " + - s"number of runnable tasks per executor to: ${numSlots}. Please adjust " + - s"your configuration." - if (Utils.isTesting) { - throw new SparkException(message) - } else { - logWarning(message) - } - } + validateTaskCpusLargeEnough(executorCores, taskCores) + val defaultProf = sc.resourceProfileManager.defaultResourceProfile + // TODO - this is temporary until all of stage level scheduling feature is integrated, + // fail if any other resource limiting due to dynamic allocation and scheduler using + // slots based on cores + val cpuSlots = executorCores/taskCores + val limitingResource = defaultProf.limitingResource(sc.conf) + if (limitingResource.nonEmpty && !limitingResource.equals(ResourceProfile.CPUS) && + defaultProf.maxTasksPerExecutor(sc.conf) < cpuSlots) { + throw new IllegalArgumentException("The number of slots on an executor has to be " + + "limited by the number of cores, otherwise you waste resources and " + + "dynamic allocation doesn't work properly. Your configuration has " + + s"core/task cpu slots = ${cpuSlots} and " + + s"${limitingResource} = " + + s"${defaultProf.maxTasksPerExecutor(sc.conf)}. 
Please adjust your configuration " + + "so that all resources require same number of executor slots.") } + ResourceUtils.warnOnWastedResources(defaultProf, sc.conf, Some(executorCores)) } master match { case "local" => - checkResourcesPerTask(clusterMode = false, Some(1)) + checkResourcesPerTask(1) val scheduler = new TaskSchedulerImpl(sc, MAX_LOCAL_TASK_FAILURES, isLocal = true) val backend = new LocalSchedulerBackend(sc.getConf, scheduler, 1) scheduler.initialize(backend) @@ -2886,7 +2822,7 @@ object SparkContext extends Logging { if (threadCount <= 0) { throw new SparkException(s"Asked to run locally with $threadCount threads") } - checkResourcesPerTask(clusterMode = false, Some(threadCount)) + checkResourcesPerTask(threadCount) val scheduler = new TaskSchedulerImpl(sc, MAX_LOCAL_TASK_FAILURES, isLocal = true) val backend = new LocalSchedulerBackend(sc.getConf, scheduler, threadCount) scheduler.initialize(backend) @@ -2897,14 +2833,13 @@ object SparkContext extends Logging { // local[*, M] means the number of cores on the computer with M failures // local[N, M] means exactly N threads with M failures val threadCount = if (threads == "*") localCpuCount else threads.toInt - checkResourcesPerTask(clusterMode = false, Some(threadCount)) + checkResourcesPerTask(threadCount) val scheduler = new TaskSchedulerImpl(sc, maxFailures.toInt, isLocal = true) val backend = new LocalSchedulerBackend(sc.getConf, scheduler, threadCount) scheduler.initialize(backend) (backend, scheduler) case SPARK_REGEX(sparkUrl) => - checkResourcesPerTask(clusterMode = true, None) val scheduler = new TaskSchedulerImpl(sc) val masterUrls = sparkUrl.split(",").map("spark://" + _) val backend = new StandaloneSchedulerBackend(scheduler, sc, masterUrls) @@ -2912,7 +2847,7 @@ object SparkContext extends Logging { (backend, scheduler) case LOCAL_CLUSTER_REGEX(numSlaves, coresPerSlave, memoryPerSlave) => - checkResourcesPerTask(clusterMode = true, Some(coresPerSlave.toInt)) + checkResourcesPerTask(coresPerSlave.toInt) // Check to make sure memory requested <= memoryPerSlave. Otherwise Spark will just hang. 
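// Illustrative sketch, not part of the patch: the slot check that checkResourcesPerTask above now
// delegates to the default ResourceProfile. Plain values stand in for the conf/profile lookups,
// and "cpus"/"gpu" are example resource names.
object CpuSlotCheckSketch {
  def check(executorCores: Int, taskCpus: Int, limitingResource: String, maxTasksPerExecutor: Int): Unit = {
    val cpuSlots = executorCores / taskCpus
    // if a custom resource (e.g. gpu) allows fewer concurrent tasks than the CPU slots,
    // some CPUs could never run a task, so the configuration is rejected
    if (limitingResource.nonEmpty && limitingResource != "cpus" && maxTasksPerExecutor < cpuSlots) {
      throw new IllegalArgumentException(
        s"core/task cpu slots = $cpuSlots but $limitingResource only allows $maxTasksPerExecutor tasks")
    }
  }
  // e.g. 4 cores with 1 cpu per task gives 4 slots; if gpus only allow 2 tasks, the check fails
}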
val memoryPerSlaveInt = memoryPerSlave.toInt if (sc.executorMemory > memoryPerSlaveInt) { @@ -2941,7 +2876,6 @@ object SparkContext extends Logging { (backend, scheduler) case masterUrl => - checkResourcesPerTask(clusterMode = true, None) val cm = getClusterManager(masterUrl) match { case Some(clusterMgr) => clusterMgr case None => throw new SparkException("Could not parse Master URL: '" + master + "'") diff --git a/core/src/main/scala/org/apache/spark/internal/config/Tests.scala b/core/src/main/scala/org/apache/spark/internal/config/Tests.scala index 21660ab3a9512..51df73ebde07d 100644 --- a/core/src/main/scala/org/apache/spark/internal/config/Tests.scala +++ b/core/src/main/scala/org/apache/spark/internal/config/Tests.scala @@ -53,4 +53,13 @@ private[spark] object Tests { val TEST_N_CORES_EXECUTOR = ConfigBuilder("spark.testing.nCoresPerExecutor") .intConf .createWithDefault(2) + + val RESOURCES_WARNING_TESTING = + ConfigBuilder("spark.resources.warnings.testing").booleanConf.createWithDefault(false) + + val RESOURCE_PROFILE_MANAGER_TESTING = + ConfigBuilder("spark.testing.resourceProfileManager") + .booleanConf + .createWithDefault(false) + } diff --git a/core/src/main/scala/org/apache/spark/resource/ExecutorResourceRequests.scala b/core/src/main/scala/org/apache/spark/resource/ExecutorResourceRequests.scala index d345674d6635c..d4c29f9a70c44 100644 --- a/core/src/main/scala/org/apache/spark/resource/ExecutorResourceRequests.scala +++ b/core/src/main/scala/org/apache/spark/resource/ExecutorResourceRequests.scala @@ -109,7 +109,7 @@ private[spark] class ExecutorResourceRequests() extends Serializable { discoveryScript: String = "", vendor: String = ""): this.type = { // a bit weird but for Java api use empty string as meaning None because empty - // string is otherwise invalid for those paramters anyway + // string is otherwise invalid for those parameters anyway val req = new ExecutorResourceRequest(resourceName, amount, discoveryScript, vendor) _executorResources.put(resourceName, req) this diff --git a/core/src/main/scala/org/apache/spark/resource/ResourceProfile.scala b/core/src/main/scala/org/apache/spark/resource/ResourceProfile.scala index 14019d27fc2e6..03dcf5e317798 100644 --- a/core/src/main/scala/org/apache/spark/resource/ResourceProfile.scala +++ b/core/src/main/scala/org/apache/spark/resource/ResourceProfile.scala @@ -22,12 +22,14 @@ import java.util.concurrent.atomic.AtomicInteger import javax.annotation.concurrent.GuardedBy import scala.collection.JavaConverters._ +import scala.collection.mutable -import org.apache.spark.SparkConf +import org.apache.spark.{SparkConf, SparkException} import org.apache.spark.annotation.Evolving import org.apache.spark.internal.Logging import org.apache.spark.internal.config._ import org.apache.spark.internal.config.Python.PYSPARK_EXECUTOR_MEMORY +import org.apache.spark.util.Utils /** * Resource profile to associate with an RDD. A ResourceProfile allows the user to @@ -42,6 +44,13 @@ class ResourceProfile( // _id is only a var for testing purposes private var _id = ResourceProfile.getNextProfileId + // This is used for any resources that use fractional amounts, the key is the resource name + // and the value is the number of tasks that can share a resource address. For example, + // if the user says task gpu amount is 0.5, that results in 2 tasks per resource address. 
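// Illustrative sketch, not part of the patch: how a fractional task amount becomes "slots per
// resource address", matching the 0.5-gpu example in the comment above.
object FractionalSlotsSketch {
  // returns (whole addresses needed per task, tasks that may share one address)
  def slotsPerAddress(taskAmount: Double): (Int, Int) = {
    if (taskAmount <= 0.5) (1, math.floor(1.0 / taskAmount).toInt)
    else if (taskAmount % 1 == 0) (taskAmount.toInt, 1)
    else throw new IllegalArgumentException(s"amount $taskAmount must be <= 0.5 or a whole number")
  }
  // slotsPerAddress(0.5) == (1, 2): two tasks share each gpu address
  // slotsPerAddress(2.0) == (2, 1): each task needs two whole addresses to itself
}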
+ private var _executorResourceSlotsPerAddr: Option[Map[String, Int]] = None + private var _limitingResource: Option[String] = None + private var _maxTasksPerExecutor: Option[Int] = None + private var _coresLimitKnown: Boolean = false def id: Int = _id @@ -67,6 +76,138 @@ class ResourceProfile( taskResources.get(ResourceProfile.CPUS).map(_.amount.toInt) } + private[spark] def getNumSlotsPerAddress(resource: String, sparkConf: SparkConf): Int = { + _executorResourceSlotsPerAddr.getOrElse { + calculateTasksAndLimitingResource(sparkConf) + } + _executorResourceSlotsPerAddr.get.getOrElse(resource, + throw new SparkException(s"Resource $resource doesn't exist in profile id: $id")) + } + + // Maximum tasks you could put on an executor with this profile based on the limiting resource. + // If the executor cores config is not present this value is based on the other resources + // available or 1 if no other resources. You need to check the isCoresLimitKnown to + // calculate proper value. + private[spark] def maxTasksPerExecutor(sparkConf: SparkConf): Int = { + _maxTasksPerExecutor.getOrElse { + calculateTasksAndLimitingResource(sparkConf) + _maxTasksPerExecutor.get + } + } + + // Returns whether the executor cores was available to use to calculate the max tasks + // per executor and limiting resource. Some cluster managers (like standalone and coarse + // grained mesos) don't use the cores config by default so we can't use it to calculate slots. + private[spark] def isCoresLimitKnown: Boolean = _coresLimitKnown + + // The resource that has the least amount of slots per executor. Its possible multiple or all + // resources result in same number of slots and this could be any of those. + // If the executor cores config is not present this value is based on the other resources + // available or empty string if no other resources. You need to check the isCoresLimitKnown to + // calculate proper value. + private[spark] def limitingResource(sparkConf: SparkConf): String = { + _limitingResource.getOrElse { + calculateTasksAndLimitingResource(sparkConf) + _limitingResource.get + } + } + + // executor cores config is not set for some masters by default and the default value + // only applies to yarn/k8s + private def shouldCheckExecutorCores(sparkConf: SparkConf): Boolean = { + val master = sparkConf.getOption("spark.master") + sparkConf.contains(EXECUTOR_CORES) || + (master.isDefined && (master.get.equalsIgnoreCase("yarn") || master.get.startsWith("k8s"))) + } + + /** + * Utility function to calculate the number of tasks you can run on a single Executor based + * on the task and executor resource requests in the ResourceProfile. This will be based + * off the resource that is most restrictive. For instance, if the executor + * request is for 4 cpus and 2 gpus and your task request is for 1 cpu and 1 gpu each, the + * limiting resource is gpu and the number of tasks you can run on a single executor is 2. + * This function also sets the limiting resource, isCoresLimitKnown and number of slots per + * resource address. 
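// Illustrative sketch, not part of the patch: the worked example from the comment above, as a
// simplified tasks-per-executor calculation that ignores fractional amounts. For each requested
// resource it divides what the executor has by what one task needs and takes the minimum.
object LimitingResourceSketch {
  def limiting(executorAmounts: Map[String, Double], taskAmounts: Map[String, Double]): (String, Int) = {
    val slotsPerResource = taskAmounts.collect { case (name, perTask) if perTask > 0.0 =>
      name -> math.floor(executorAmounts.getOrElse(name, 0.0) / perTask).toInt
    }
    slotsPerResource.minBy(_._2) // (limiting resource name, tasks per executor)
  }
  // limiting(Map("cpus" -> 4.0, "gpu" -> 2.0), Map("cpus" -> 1.0, "gpu" -> 1.0)) == ("gpu", 2)
}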
+ */ + private def calculateTasksAndLimitingResource(sparkConf: SparkConf): Unit = synchronized { + val shouldCheckExecCores = shouldCheckExecutorCores(sparkConf) + var (taskLimit, limitingResource) = if (shouldCheckExecCores) { + val cpusPerTask = taskResources.get(ResourceProfile.CPUS) + .map(_.amount).getOrElse(sparkConf.get(CPUS_PER_TASK).toDouble).toInt + assert(cpusPerTask > 0, "CPUs per task configuration has to be > 0") + val coresPerExecutor = getExecutorCores.getOrElse(sparkConf.get(EXECUTOR_CORES)) + _coresLimitKnown = true + ResourceUtils.validateTaskCpusLargeEnough(coresPerExecutor, cpusPerTask) + val tasksBasedOnCores = coresPerExecutor / cpusPerTask + // Note that if the cores per executor aren't set properly this calculation could be off, + // we default it to just be 1 in order to allow checking of the rest of the custom + // resources. We set the limit based on the other resources available. + (tasksBasedOnCores, ResourceProfile.CPUS) + } else { + (-1, "") + } + val numPartsPerResourceMap = new mutable.HashMap[String, Int] + numPartsPerResourceMap(ResourceProfile.CORES) = 1 + val taskResourcesToCheck = new mutable.HashMap[String, TaskResourceRequest] + taskResourcesToCheck ++= ResourceProfile.getCustomTaskResources(this) + val execResourceToCheck = ResourceProfile.getCustomExecutorResources(this) + execResourceToCheck.foreach { case (rName, execReq) => + val taskReq = taskResources.get(rName).map(_.amount).getOrElse(0.0) + numPartsPerResourceMap(rName) = 1 + if (taskReq > 0.0) { + if (taskReq > execReq.amount) { + throw new SparkException(s"The executor resource: $rName, amount: ${execReq.amount} " + + s"needs to be >= the task resource request amount of $taskReq") + } + val (numPerTask, parts) = ResourceUtils.calculateAmountAndPartsForFraction(taskReq) + numPartsPerResourceMap(rName) = parts + val numTasks = ((execReq.amount * parts) / numPerTask).toInt + if (taskLimit == -1 || numTasks < taskLimit) { + if (shouldCheckExecCores) { + // TODO - until resource profiles full implemented we need to error if cores not + // limiting resource because the scheduler code uses that for slots + throw new IllegalArgumentException("The number of slots on an executor has to be " + + "limited by the number of cores, otherwise you waste resources and " + + "dynamic allocation doesn't work properly. Your configuration has " + + s"core/task cpu slots = ${taskLimit} and " + + s"${execReq.resourceName} = ${numTasks}. " + + "Please adjust your configuration so that all resources require same number " + + "of executor slots.") + } + limitingResource = rName + taskLimit = numTasks + } + taskResourcesToCheck -= rName + } else { + logWarning(s"The executor resource config for resource: $rName was specified but " + + "no corresponding task resource request was specified.") + } + } + if(!shouldCheckExecCores) { + // if we can't rely on the executor cores config throw a warning for user + logWarning("Please ensure that the number of slots available on your " + + "executors is limited by the number of cores to task cpus and not another " + + "custom resource. 
If cores is not the limiting resource then dynamic " + + "allocation will not work properly!") + } + if (taskResourcesToCheck.nonEmpty) { + throw new SparkException("No executor resource configs were not specified for the " + + s"following task configs: ${taskResourcesToCheck.keys.mkString(",")}") + } + logInfo(s"Limiting resource is $limitingResource at $taskLimit tasks per executor") + _executorResourceSlotsPerAddr = Some(numPartsPerResourceMap.toMap) + _maxTasksPerExecutor = if (taskLimit == -1) Some(1) else Some(taskLimit) + _limitingResource = Some(limitingResource) + if (shouldCheckExecCores) { + ResourceUtils.warnOnWastedResources(this, sparkConf) + } + } + + // to be used only by history server for reconstruction from events + private[spark] def setResourceProfileId(id: Int): Unit = { + _id = id + } + // testing only private[spark] def setToDefaultProfile(): Unit = { _id = ResourceProfile.DEFAULT_RESOURCE_PROFILE_ID @@ -123,7 +264,7 @@ object ResourceProfile extends Logging { val taskResources = getDefaultTaskResources(conf) val executorResources = getDefaultExecutorResources(conf) val defProf = new ResourceProfile(executorResources, taskResources) - defProf.setToDefaultProfile + defProf.setToDefaultProfile() defaultProfile = Some(defProf) logInfo("Default ResourceProfile created, executor resources: " + s"${defProf.executorResources}, task resources: " + @@ -157,13 +298,12 @@ object ResourceProfile extends Logging { // for testing only private[spark] def reInitDefaultProfile(conf: SparkConf): Unit = { - clearDefaultProfile + clearDefaultProfile() // force recreate it after clearing getOrCreateDefaultProfile(conf) } - // for testing only - private[spark] def clearDefaultProfile: Unit = { + private[spark] def clearDefaultProfile(): Unit = { DEFAULT_PROFILE_LOCK.synchronized { defaultProfile = None } diff --git a/core/src/main/scala/org/apache/spark/resource/ResourceProfileBuilder.scala b/core/src/main/scala/org/apache/spark/resource/ResourceProfileBuilder.scala index 0d55c176eeb65..26f23f4bf0476 100644 --- a/core/src/main/scala/org/apache/spark/resource/ResourceProfileBuilder.scala +++ b/core/src/main/scala/org/apache/spark/resource/ResourceProfileBuilder.scala @@ -31,7 +31,7 @@ import org.apache.spark.annotation.Evolving * requirements between stages. */ @Evolving -class ResourceProfileBuilder() { +private[spark] class ResourceProfileBuilder() { private val _taskResources = new ConcurrentHashMap[String, TaskResourceRequest]() private val _executorResources = new ConcurrentHashMap[String, ExecutorResourceRequest]() diff --git a/core/src/main/scala/org/apache/spark/resource/ResourceProfileManager.scala b/core/src/main/scala/org/apache/spark/resource/ResourceProfileManager.scala new file mode 100644 index 0000000000000..06db9468c451e --- /dev/null +++ b/core/src/main/scala/org/apache/spark/resource/ResourceProfileManager.scala @@ -0,0 +1,86 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.resource + +import java.util.concurrent.ConcurrentHashMap + +import org.apache.spark.{SparkConf, SparkException} +import org.apache.spark.annotation.Evolving +import org.apache.spark.internal.Logging +import org.apache.spark.internal.config.Tests._ +import org.apache.spark.util.Utils +import org.apache.spark.util.Utils.isTesting + +/** + * Manager of resource profiles. The manager allows one place to keep the actual ResourceProfiles + * and everywhere else we can use the ResourceProfile Id to save on space. + * Note we never remove a resource profile at this point. It's expected this number is small + * so this shouldn't be much overhead. + */ +@Evolving +private[spark] class ResourceProfileManager(sparkConf: SparkConf) extends Logging { + private val resourceProfileIdToResourceProfile = new ConcurrentHashMap[Int, ResourceProfile]() + + private val defaultProfile = ResourceProfile.getOrCreateDefaultProfile(sparkConf) + addResourceProfile(defaultProfile) + + def defaultResourceProfile: ResourceProfile = defaultProfile + + private val taskCpusDefaultProfile = defaultProfile.getTaskCpus.get + private val dynamicEnabled = Utils.isDynamicAllocationEnabled(sparkConf) + private val master = sparkConf.getOption("spark.master") + private val isNotYarn = master.isDefined && !master.get.equals("yarn") + private val errorForTesting = !isTesting || sparkConf.get(RESOURCE_PROFILE_MANAGER_TESTING) + + // If we use anything except the default profile, it's only supported on YARN right now. + // Throw an exception if not supported. + private[spark] def isSupported(rp: ResourceProfile): Boolean = { + val isNotDefaultProfile = rp.id != ResourceProfile.DEFAULT_RESOURCE_PROFILE_ID + val notYarnAndNotDefaultProfile = isNotDefaultProfile && isNotYarn + val YarnNotDynAllocAndNotDefaultProfile = isNotDefaultProfile && !isNotYarn && !dynamicEnabled + if (errorForTesting && (notYarnAndNotDefaultProfile || YarnNotDynAllocAndNotDefaultProfile)) { + throw new SparkException("ResourceProfiles are only supported on YARN with dynamic " + + "allocation enabled.") + } + true + } + + def addResourceProfile(rp: ResourceProfile): Unit = { + isSupported(rp) + // force the computation of maxTasks and limitingResource now so we don't have cost later + rp.limitingResource(sparkConf) + logInfo(s"Adding ResourceProfile id: ${rp.id}") + resourceProfileIdToResourceProfile.putIfAbsent(rp.id, rp) + } + + /* + * Gets the ResourceProfile associated with the id; if a profile doesn't exist + * it throws a SparkException.
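// Illustrative sketch, not part of the patch: a minimal model of the registry above. Profiles are
// kept in a map keyed by id, a lookup of an unknown id fails, and non-default profiles are only
// accepted on YARN with dynamic allocation enabled (the testing-only escape hatch is omitted).
object ProfileRegistrySketch {
  import java.util.concurrent.ConcurrentHashMap

  case class Profile(id: Int)                      // stand-in for ResourceProfile
  val defaultProfileId = 0                         // assumed id of the default profile
  private val profiles = new ConcurrentHashMap[Int, Profile]()
  profiles.put(defaultProfileId, Profile(defaultProfileId))

  def add(p: Profile, isYarn: Boolean, dynamicAllocationEnabled: Boolean): Unit = {
    val supported = p.id == defaultProfileId || (isYarn && dynamicAllocationEnabled)
    require(supported, "ResourceProfiles are only supported on YARN with dynamic allocation enabled.")
    profiles.putIfAbsent(p.id, p)
  }

  def byId(id: Int): Profile = {
    val p = profiles.get(id)
    if (p == null) throw new IllegalArgumentException(s"ResourceProfileId $id not found!") else p
  }
}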
+ */ + def resourceProfileFromId(rpId: Int): ResourceProfile = { + val rp = resourceProfileIdToResourceProfile.get(rpId) + if (rp == null) { + throw new SparkException(s"ResourceProfileId $rpId not found!") + } + rp + } + + def taskCpusForProfileId(rpId: Int): Int = { + resourceProfileFromId(rpId).getTaskCpus.getOrElse(taskCpusDefaultProfile) + } +} diff --git a/core/src/main/scala/org/apache/spark/resource/ResourceUtils.scala b/core/src/main/scala/org/apache/spark/resource/ResourceUtils.scala index 7dd7fc1b99353..cdb761c7566e7 100644 --- a/core/src/main/scala/org/apache/spark/resource/ResourceUtils.scala +++ b/core/src/main/scala/org/apache/spark/resource/ResourceUtils.scala @@ -29,7 +29,8 @@ import org.apache.spark.{SparkConf, SparkException} import org.apache.spark.annotation.DeveloperApi import org.apache.spark.api.resource.ResourceDiscoveryPlugin import org.apache.spark.internal.Logging -import org.apache.spark.internal.config.{RESOURCES_DISCOVERY_PLUGIN, SPARK_TASK_PREFIX} +import org.apache.spark.internal.config.{CPUS_PER_TASK, EXECUTOR_CORES, RESOURCES_DISCOVERY_PLUGIN, SPARK_TASK_PREFIX} +import org.apache.spark.internal.config.Tests.{RESOURCES_WARNING_TESTING} import org.apache.spark.util.Utils /** @@ -161,19 +162,23 @@ private[spark] object ResourceUtils extends Logging { } // Used to take a fraction amount from a task resource requirement and split into a real - // integer amount and the number of parts expected. For instance, if the amount is 0.5, - // the we get (1, 2) back out. - // Returns tuple of (amount, numParts) - def calculateAmountAndPartsForFraction(amount: Double): (Int, Int) = { - val parts = if (amount <= 0.5) { - Math.floor(1.0 / amount).toInt - } else if (amount % 1 != 0) { + // integer amount and the number of slots per address. For instance, if the amount is 0.5, + // the we get (1, 2) back out. This indicates that for each 1 address, it has 2 slots per + // address, which allows you to put 2 tasks on that address. Note if amount is greater + // than 1, then the number of slots per address has to be 1. This would indicate that a + // would have multiple addresses assigned per task. This can be used for calculating + // the number of tasks per executor -> (executorAmount * numParts) / (integer amount). + // Returns tuple of (integer amount, numParts) + def calculateAmountAndPartsForFraction(doubleAmount: Double): (Int, Int) = { + val parts = if (doubleAmount <= 0.5) { + Math.floor(1.0 / doubleAmount).toInt + } else if (doubleAmount % 1 != 0) { throw new SparkException( - s"The resource amount ${amount} must be either <= 0.5, or a whole number.") + s"The resource amount ${doubleAmount} must be either <= 0.5, or a whole number.") } else { 1 } - (Math.ceil(amount).toInt, parts) + (Math.ceil(doubleAmount).toInt, parts) } // Add any task resource requests from the spark conf to the TaskResourceRequests passed in @@ -382,6 +387,90 @@ private[spark] object ResourceUtils extends Logging { s"${resourceRequest.id.resourceName}") } + def validateTaskCpusLargeEnough(execCores: Int, taskCpus: Int): Boolean = { + // Number of cores per executor must meet at least one task requirement. 
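+    // Illustrative example only (not part of this change); config names shown for context:
+    // with spark.executor.cores=2 and spark.task.cpus=4 no task could ever be scheduled, so
+    // the check below fails fast, while spark.executor.cores=8 with spark.task.cpus=4 passes:
+    //   validateTaskCpusLargeEnough(execCores = 2, taskCpus = 4)  // throws SparkException
+    //   validateTaskCpusLargeEnough(execCores = 8, taskCpus = 4)  // returns true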
+ if (execCores < taskCpus) { + throw new SparkException(s"The number of cores per executor (=$execCores) has to be >= " + + s"the number of cpus per task = $taskCpus.") + } + true + } + + // the option executor cores parameter is by the different local modes since it not configured + // via the config + def warnOnWastedResources( + rp: ResourceProfile, + sparkConf: SparkConf, + execCores: Option[Int] = None): Unit = { + // There have been checks on the ResourceProfile to make sure the executor resources were + // specified and are large enough if any task resources were specified. + // Now just do some sanity test and log warnings when it looks like the user will + // waste some resources. + val coresKnown = rp.isCoresLimitKnown + var limitingResource = rp.limitingResource(sparkConf) + var maxTaskPerExec = rp.maxTasksPerExecutor(sparkConf) + val taskCpus = rp.getTaskCpus.getOrElse(sparkConf.get(CPUS_PER_TASK)) + val cores = if (execCores.isDefined) { + execCores.get + } else if (coresKnown) { + rp.getExecutorCores.getOrElse(sparkConf.get(EXECUTOR_CORES)) + } else { + // can't calculate cores limit + return + } + // when executor cores config isn't set, we can't calculate the real limiting resource + // and number of tasks per executor ahead of time, so calculate it now. + if (!coresKnown) { + val numTasksPerExecCores = cores / taskCpus + val numTasksPerExecCustomResource = rp.maxTasksPerExecutor(sparkConf) + if (limitingResource.isEmpty || + (limitingResource.nonEmpty && numTasksPerExecCores < numTasksPerExecCustomResource)) { + limitingResource = ResourceProfile.CPUS + maxTaskPerExec = numTasksPerExecCores + } + } + val taskReq = ResourceProfile.getCustomTaskResources(rp) + val execReq = ResourceProfile.getCustomExecutorResources(rp) + + if (limitingResource.nonEmpty && !limitingResource.equals(ResourceProfile.CPUS)) { + if ((taskCpus * maxTaskPerExec) < cores) { + val resourceNumSlots = Math.floor(cores/taskCpus).toInt + val message = s"The configuration of cores (exec = ${cores} " + + s"task = ${taskCpus}, runnable tasks = ${resourceNumSlots}) will " + + s"result in wasted resources due to resource ${limitingResource} limiting the " + + s"number of runnable tasks per executor to: ${maxTaskPerExec}. Please adjust " + + "your configuration." + if (sparkConf.get(RESOURCES_WARNING_TESTING)) { + throw new SparkException(message) + } else { + logWarning(message) + } + } + } + + taskReq.foreach { case (rName, treq) => + val execAmount = execReq(rName).amount + val numParts = rp.getNumSlotsPerAddress(rName, sparkConf) + // handle fractional + val taskAmount = if (numParts > 1) 1 else treq.amount + if (maxTaskPerExec < (execAmount * numParts / taskAmount)) { + val taskReqStr = s"${taskAmount}/${numParts}" + val resourceNumSlots = Math.floor(execAmount * numParts / taskAmount).toInt + val message = s"The configuration of resource: ${treq.resourceName} " + + s"(exec = ${execAmount}, task = ${taskReqStr}, " + + s"runnable tasks = ${resourceNumSlots}) will " + + s"result in wasted resources due to resource ${limitingResource} limiting the " + + s"number of runnable tasks per executor to: ${maxTaskPerExec}. Please adjust " + + "your configuration." 
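+        // Worked example (illustrative numbers, not taken from this change): with 16 executor
+        // cores, spark.task.cpus=1, 2 GPUs per executor and 1 GPU per task, the GPU is the
+        // limiting resource and maxTaskPerExec = 2, so taskCpus * maxTaskPerExec = 2 < 16 and
+        // 14 of the 16 cores per executor would sit idle; that is what this warning reports.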
+ if (sparkConf.get(RESOURCES_WARNING_TESTING)) { + throw new SparkException(message) + } else { + logWarning(message) + } + } + } + } + // known types of resources final val GPU: String = "gpu" final val FPGA: String = "fpga" diff --git a/core/src/main/scala/org/apache/spark/scheduler/DAGScheduler.scala b/core/src/main/scala/org/apache/spark/scheduler/DAGScheduler.scala index 7bf363dd71c1b..fd5c3e0827bf9 100644 --- a/core/src/main/scala/org/apache/spark/scheduler/DAGScheduler.scala +++ b/core/src/main/scala/org/apache/spark/scheduler/DAGScheduler.scala @@ -37,7 +37,8 @@ import org.apache.spark.internal.config import org.apache.spark.internal.config.Tests.TEST_NO_STAGE_RETRY import org.apache.spark.network.util.JavaUtils import org.apache.spark.partial.{ApproximateActionListener, ApproximateEvaluator, PartialResult} -import org.apache.spark.rdd.{DeterministicLevel, RDD, RDDCheckpointData} +import org.apache.spark.rdd.{RDD, RDDCheckpointData} +import org.apache.spark.resource.ResourceProfile import org.apache.spark.rpc.RpcTimeout import org.apache.spark.storage._ import org.apache.spark.storage.BlockManagerMessages.BlockManagerHeartbeat @@ -391,7 +392,8 @@ private[spark] class DAGScheduler( val parents = getOrCreateParentStages(rdd, jobId) val id = nextStageId.getAndIncrement() val stage = new ShuffleMapStage( - id, rdd, numTasks, parents, jobId, rdd.creationSite, shuffleDep, mapOutputTracker) + id, rdd, numTasks, parents, jobId, rdd.creationSite, shuffleDep, mapOutputTracker, + ResourceProfile.DEFAULT_RESOURCE_PROFILE_ID) stageIdToStage(id) = stage shuffleIdToMapStage(shuffleDep.shuffleId) = stage @@ -453,7 +455,8 @@ private[spark] class DAGScheduler( checkBarrierStageWithRDDChainPattern(rdd, partitions.toSet.size) val parents = getOrCreateParentStages(rdd, jobId) val id = nextStageId.getAndIncrement() - val stage = new ResultStage(id, rdd, func, partitions, parents, jobId, callSite) + val stage = new ResultStage(id, rdd, func, partitions, parents, jobId, callSite, + ResourceProfile.DEFAULT_RESOURCE_PROFILE_ID) stageIdToStage(id) = stage updateJobIdStageIdMaps(jobId, stage) stage diff --git a/core/src/main/scala/org/apache/spark/scheduler/ResultStage.scala b/core/src/main/scala/org/apache/spark/scheduler/ResultStage.scala index d1687830ff7bf..7fdc3186e86bd 100644 --- a/core/src/main/scala/org/apache/spark/scheduler/ResultStage.scala +++ b/core/src/main/scala/org/apache/spark/scheduler/ResultStage.scala @@ -34,8 +34,9 @@ private[spark] class ResultStage( val partitions: Array[Int], parents: List[Stage], firstJobId: Int, - callSite: CallSite) - extends Stage(id, rdd, partitions.length, parents, firstJobId, callSite) { + callSite: CallSite, + resourceProfileId: Int) + extends Stage(id, rdd, partitions.length, parents, firstJobId, callSite, resourceProfileId) { /** * The active job for this result stage. 
Will be empty if the job has already finished diff --git a/core/src/main/scala/org/apache/spark/scheduler/ShuffleMapStage.scala b/core/src/main/scala/org/apache/spark/scheduler/ShuffleMapStage.scala index 1b44d0aee3195..be1984de9837f 100644 --- a/core/src/main/scala/org/apache/spark/scheduler/ShuffleMapStage.scala +++ b/core/src/main/scala/org/apache/spark/scheduler/ShuffleMapStage.scala @@ -42,8 +42,9 @@ private[spark] class ShuffleMapStage( firstJobId: Int, callSite: CallSite, val shuffleDep: ShuffleDependency[_, _, _], - mapOutputTrackerMaster: MapOutputTrackerMaster) - extends Stage(id, rdd, numTasks, parents, firstJobId, callSite) { + mapOutputTrackerMaster: MapOutputTrackerMaster, + resourceProfileId: Int) + extends Stage(id, rdd, numTasks, parents, firstJobId, callSite, resourceProfileId) { private[this] var _mapStageJobs: List[ActiveJob] = Nil diff --git a/core/src/main/scala/org/apache/spark/scheduler/Stage.scala b/core/src/main/scala/org/apache/spark/scheduler/Stage.scala index a9f72eae71368..ae7924d66a301 100644 --- a/core/src/main/scala/org/apache/spark/scheduler/Stage.scala +++ b/core/src/main/scala/org/apache/spark/scheduler/Stage.scala @@ -59,7 +59,8 @@ private[scheduler] abstract class Stage( val numTasks: Int, val parents: List[Stage], val firstJobId: Int, - val callSite: CallSite) + val callSite: CallSite, + val resourceProfileId: Int) extends Logging { val numPartitions = rdd.partitions.length @@ -79,7 +80,8 @@ private[scheduler] abstract class Stage( * StageInfo to tell SparkListeners when a job starts (which happens before any stage attempts * have been created). */ - private var _latestInfo: StageInfo = StageInfo.fromStage(this, nextAttemptId) + private var _latestInfo: StageInfo = + StageInfo.fromStage(this, nextAttemptId, resourceProfileId = resourceProfileId) /** * Set of stage attempt IDs that have failed. We keep track of these failures in order to avoid @@ -100,7 +102,8 @@ private[scheduler] abstract class Stage( val metrics = new TaskMetrics metrics.register(rdd.sparkContext) _latestInfo = StageInfo.fromStage( - this, nextAttemptId, Some(numPartitionsToCompute), metrics, taskLocalityPreferences) + this, nextAttemptId, Some(numPartitionsToCompute), metrics, taskLocalityPreferences, + resourceProfileId = resourceProfileId) nextAttemptId += 1 } diff --git a/core/src/main/scala/org/apache/spark/scheduler/StageInfo.scala b/core/src/main/scala/org/apache/spark/scheduler/StageInfo.scala index fdc50328b43d8..556478d83cf39 100644 --- a/core/src/main/scala/org/apache/spark/scheduler/StageInfo.scala +++ b/core/src/main/scala/org/apache/spark/scheduler/StageInfo.scala @@ -38,7 +38,8 @@ class StageInfo( val details: String, val taskMetrics: TaskMetrics = null, private[spark] val taskLocalityPreferences: Seq[Seq[TaskLocation]] = Seq.empty, - private[spark] val shuffleDepId: Option[Int] = None) { + private[spark] val shuffleDepId: Option[Int] = None, + val resourceProfileId: Int) { /** When this stage was submitted from the DAGScheduler to a TaskScheduler. */ var submissionTime: Option[Long] = None /** Time when all tasks in the stage completed or when the stage was cancelled. 
*/ @@ -87,7 +88,8 @@ private[spark] object StageInfo { attemptId: Int, numTasks: Option[Int] = None, taskMetrics: TaskMetrics = null, - taskLocalityPreferences: Seq[Seq[TaskLocation]] = Seq.empty + taskLocalityPreferences: Seq[Seq[TaskLocation]] = Seq.empty, + resourceProfileId: Int ): StageInfo = { val ancestorRddInfos = stage.rdd.getNarrowAncestors.map(RDDInfo.fromRdd) val rddInfos = Seq(RDDInfo.fromRdd(stage.rdd)) ++ ancestorRddInfos @@ -105,6 +107,7 @@ private[spark] object StageInfo { stage.details, taskMetrics, taskLocalityPreferences, - shuffleDepId) + shuffleDepId, + resourceProfileId) } } diff --git a/core/src/main/scala/org/apache/spark/scheduler/TaskSchedulerImpl.scala b/core/src/main/scala/org/apache/spark/scheduler/TaskSchedulerImpl.scala index 6a1d460e6a9d9..bf92081d13907 100644 --- a/core/src/main/scala/org/apache/spark/scheduler/TaskSchedulerImpl.scala +++ b/core/src/main/scala/org/apache/spark/scheduler/TaskSchedulerImpl.scala @@ -384,7 +384,9 @@ private[spark] class TaskSchedulerImpl( */ private def resourcesMeetTaskRequirements(resources: Map[String, Buffer[String]]): Boolean = { val resourcesFree = resources.map(r => r._1 -> r._2.length) - ResourceUtils.resourcesMeetRequirements(resourcesFree, resourcesReqsPerTask) + val meetsReqs = ResourceUtils.resourcesMeetRequirements(resourcesFree, resourcesReqsPerTask) + logDebug(s"Resources meet task requirements is: $meetsReqs") + meetsReqs } /** diff --git a/core/src/main/scala/org/apache/spark/scheduler/cluster/CoarseGrainedSchedulerBackend.scala b/core/src/main/scala/org/apache/spark/scheduler/cluster/CoarseGrainedSchedulerBackend.scala index 55f4005ef1b45..63aa04986b073 100644 --- a/core/src/main/scala/org/apache/spark/scheduler/cluster/CoarseGrainedSchedulerBackend.scala +++ b/core/src/main/scala/org/apache/spark/scheduler/cluster/CoarseGrainedSchedulerBackend.scala @@ -69,13 +69,6 @@ class CoarseGrainedSchedulerBackend(scheduler: TaskSchedulerImpl, val rpcEnv: Rp conf.get(SCHEDULER_MAX_REGISTERED_RESOURCE_WAITING_TIME)) private val createTimeNs = System.nanoTime() - private val taskResourceNumParts: Map[String, Int] = - if (scheduler.resourcesReqsPerTask != null) { - scheduler.resourcesReqsPerTask.map(req => req.resourceName -> req.numParts).toMap - } else { - Map.empty - } - // Accessing `executorDataMap` in the inherited methods from ThreadSafeRpcEndpoint doesn't need // any protection. But accessing `executorDataMap` out of the inherited methods must be // protected by `CoarseGrainedSchedulerBackend.this`. Besides, `executorDataMap` should only @@ -83,13 +76,10 @@ class CoarseGrainedSchedulerBackend(scheduler: TaskSchedulerImpl, val rpcEnv: Rp // `CoarseGrainedSchedulerBackend.this`. 
private val executorDataMap = new HashMap[String, ExecutorData] - // Number of executors requested by the cluster manager, [[ExecutorAllocationManager]] - @GuardedBy("CoarseGrainedSchedulerBackend.this") - private var requestedTotalExecutors = 0 - - // Number of executors requested from the cluster manager that have not registered yet + // Number of executors for each ResourceProfile requested by the cluster + // manager, [[ExecutorAllocationManager]] @GuardedBy("CoarseGrainedSchedulerBackend.this") - private var numPendingExecutors = 0 + private val requestedTotalExecutorsPerResourceProfile = new HashMap[ResourceProfile, Int] private val listenerBus = scheduler.sc.listenerBus @@ -102,13 +92,13 @@ class CoarseGrainedSchedulerBackend(scheduler: TaskSchedulerImpl, val rpcEnv: Rp // Executors that have been lost, but for which we don't yet know the real exit reason. private val executorsPendingLossReason = new HashSet[String] - // A map to store hostname with its possible task number running on it + // A map of ResourceProfile id to map of hostname with its possible task number running on it @GuardedBy("CoarseGrainedSchedulerBackend.this") - protected var hostToLocalTaskCount: Map[String, Int] = Map.empty + protected var rpHostToLocalTaskCount: Map[Int, Map[String, Int]] = Map.empty - // The number of pending tasks which is locality required + // The number of pending tasks per ResourceProfile id which is locality required @GuardedBy("CoarseGrainedSchedulerBackend.this") - protected var localityAwareTasks = 0 + protected var numLocalityAwareTasksPerResourceProfileId = Map.empty[Int, Int] // The num of current max ExecutorId used to re-register appMaster @volatile protected var currentExecutorIdCounter = 0 @@ -223,16 +213,17 @@ class CoarseGrainedSchedulerBackend(scheduler: TaskSchedulerImpl, val rpcEnv: Rp } else { context.senderAddress } - logInfo(s"Registered executor $executorRef ($executorAddress) with ID $executorId") + logInfo(s"Registered executor $executorRef ($executorAddress) with ID $executorId, " + + s" ResourceProfileId $resourceProfileId") addressToExecutorId(executorAddress) = executorId totalCoreCount.addAndGet(cores) totalRegisteredExecutors.addAndGet(1) - val resourcesInfo = resources.map{ case (k, v) => - (v.name, - new ExecutorResourceInfo(v.name, v.addresses, - // tell the executor it can schedule resources up to numParts times, - // as configured by the user, or set to 1 as that is the default (1 task/resource) - taskResourceNumParts.getOrElse(v.name, 1))) + val resourcesInfo = resources.map { case (rName, info) => + // tell the executor it can schedule resources up to numParts times, + // as configured by the user, or set to 1 as that is the default (1 task/resource) + val numParts = scheduler.sc.resourceProfileManager + .resourceProfileFromId(resourceProfileId).getNumSlotsPerAddress(rName, conf) + (info.name, new ExecutorResourceInfo(info.name, info.addresses, numParts)) } val data = new ExecutorData(executorRef, executorAddress, hostname, 0, cores, logUrlHandler.applyPattern(logUrls, attributes), attributes, @@ -244,10 +235,6 @@ class CoarseGrainedSchedulerBackend(scheduler: TaskSchedulerImpl, val rpcEnv: Rp if (currentExecutorIdCounter < executorId.toInt) { currentExecutorIdCounter = executorId.toInt } - if (numPendingExecutors > 0) { - numPendingExecutors -= 1 - logDebug(s"Decremented number of pending executors ($numPendingExecutors left)") - } } // Note: some tests expect the reply to come after we put the executor in the map context.reply(true) @@ -271,10 +258,7 
@@ class CoarseGrainedSchedulerBackend(scheduler: TaskSchedulerImpl, val rpcEnv: Rp context.reply(true) case RetrieveSparkAppConfig(resourceProfileId) => - // note this will be updated in later prs to get the ResourceProfile from a - // ResourceProfileManager that matches the resource profile id - // for now just use default profile - val rp = ResourceProfile.getOrCreateDefaultProfile(conf) + val rp = scheduler.sc.resourceProfileManager.resourceProfileFromId(resourceProfileId) val reply = SparkAppConfig( sparkProperties, SparkEnv.get.securityManager.getIOEncryptionKey(), @@ -494,8 +478,7 @@ class CoarseGrainedSchedulerBackend(scheduler: TaskSchedulerImpl, val rpcEnv: Rp * */ protected[scheduler] def reset(): Unit = { val executors: Set[String] = synchronized { - requestedTotalExecutors = 0 - numPendingExecutors = 0 + requestedTotalExecutorsPerResourceProfile.clear() executorDataMap.keys.toSet } @@ -577,12 +560,14 @@ class CoarseGrainedSchedulerBackend(scheduler: TaskSchedulerImpl, val rpcEnv: Rp // this function is for testing only def getExecutorResourceProfileId(executorId: String): Int = synchronized { - val res = executorDataMap.get(executorId) - res.map(_.resourceProfileId).getOrElse(ResourceProfile.UNKNOWN_RESOURCE_PROFILE_ID) + val execDataOption = executorDataMap.get(executorId) + execDataOption.map(_.resourceProfileId).getOrElse(ResourceProfile.UNKNOWN_RESOURCE_PROFILE_ID) } /** - * Request an additional number of executors from the cluster manager. + * Request an additional number of executors from the cluster manager. This is + * requesting against the default ResourceProfile, we will need an API change to + * allow against other profiles. * @return whether the request is acknowledged. */ final override def requestExecutors(numAdditionalExecutors: Int): Boolean = { @@ -594,21 +579,11 @@ class CoarseGrainedSchedulerBackend(scheduler: TaskSchedulerImpl, val rpcEnv: Rp logInfo(s"Requesting $numAdditionalExecutors additional executor(s) from the cluster manager") val response = synchronized { - requestedTotalExecutors += numAdditionalExecutors - numPendingExecutors += numAdditionalExecutors - logDebug(s"Number of pending executors is now $numPendingExecutors") - if (requestedTotalExecutors != - (numExistingExecutors + numPendingExecutors - executorsPendingToRemove.size)) { - logDebug( - s"""requestExecutors($numAdditionalExecutors): Executor request doesn't match: - |requestedTotalExecutors = $requestedTotalExecutors - |numExistingExecutors = $numExistingExecutors - |numPendingExecutors = $numPendingExecutors - |executorsPendingToRemove = ${executorsPendingToRemove.size}""".stripMargin) - } - + val defaultProf = scheduler.sc.resourceProfileManager.defaultResourceProfile + val numExisting = requestedTotalExecutorsPerResourceProfile.getOrElse(defaultProf, 0) + requestedTotalExecutorsPerResourceProfile(defaultProf) = numExisting + numAdditionalExecutors // Account for executors pending to be added or removed - doRequestTotalExecutors(requestedTotalExecutors) + doRequestTotalExecutors(requestedTotalExecutorsPerResourceProfile.toMap) } defaultAskTimeout.awaitResult(response) @@ -617,39 +592,41 @@ class CoarseGrainedSchedulerBackend(scheduler: TaskSchedulerImpl, val rpcEnv: Rp /** * Update the cluster manager on our scheduling needs. Three bits of information are included * to help it make decisions. - * @param numExecutors The total number of executors we'd like to have. 
The cluster manager - * shouldn't kill any running executor to reach this number, but, - * if all existing executors were to die, this is the number of executors - * we'd want to be allocated. - * @param localityAwareTasks The number of tasks in all active stages that have a locality - * preferences. This includes running, pending, and completed tasks. + * @param resourceProfileToNumExecutors The total number of executors we'd like to have per + * ResourceProfile. The cluster manager shouldn't kill any + * running executor to reach this number, but, if all + * existing executors were to die, this is the number + * of executors we'd want to be allocated. + * @param numLocalityAwareTasksPerResourceProfileId The number of tasks in all active stages that + * have a locality preferences per + * ResourceProfile. This includes running, + * pending, and completed tasks. * @param hostToLocalTaskCount A map of hosts to the number of tasks from all active stages * that would like to like to run on that host. * This includes running, pending, and completed tasks. * @return whether the request is acknowledged by the cluster manager. */ final override def requestTotalExecutors( - numExecutors: Int, - localityAwareTasks: Int, - hostToLocalTaskCount: Map[String, Int] - ): Boolean = { - if (numExecutors < 0) { + resourceProfileIdToNumExecutors: Map[Int, Int], + numLocalityAwareTasksPerResourceProfileId: Map[Int, Int], + hostToLocalTaskCount: Map[Int, Map[String, Int]] + ): Boolean = { + val totalExecs = resourceProfileIdToNumExecutors.values.sum + if (totalExecs < 0) { throw new IllegalArgumentException( "Attempted to request a negative number of executor(s) " + - s"$numExecutors from the cluster manager. Please specify a positive number!") + s"$totalExecs from the cluster manager. Please specify a positive number!") + } + val resourceProfileToNumExecutors = resourceProfileIdToNumExecutors.map { case (rpid, num) => + (scheduler.sc.resourceProfileManager.resourceProfileFromId(rpid), num) } - val response = synchronized { - this.requestedTotalExecutors = numExecutors - this.localityAwareTasks = localityAwareTasks - this.hostToLocalTaskCount = hostToLocalTaskCount - - numPendingExecutors = - math.max(numExecutors - numExistingExecutors + executorsPendingToRemove.size, 0) - - doRequestTotalExecutors(numExecutors) + this.requestedTotalExecutorsPerResourceProfile.clear() + this.requestedTotalExecutorsPerResourceProfile ++= resourceProfileToNumExecutors + this.numLocalityAwareTasksPerResourceProfileId = numLocalityAwareTasksPerResourceProfileId + this.rpHostToLocalTaskCount = hostToLocalTaskCount + doRequestTotalExecutors(requestedTotalExecutorsPerResourceProfile.toMap) } - defaultAskTimeout.awaitResult(response) } @@ -665,7 +642,8 @@ class CoarseGrainedSchedulerBackend(scheduler: TaskSchedulerImpl, val rpcEnv: Rp * * @return a future whose evaluation indicates whether the request is acknowledged. */ - protected def doRequestTotalExecutors(requestedTotal: Int): Future[Boolean] = + protected def doRequestTotalExecutors( + resourceProfileToTotalExecs: Map[ResourceProfile, Int]): Future[Boolean] = Future.successful(false) /** @@ -706,20 +684,20 @@ class CoarseGrainedSchedulerBackend(scheduler: TaskSchedulerImpl, val rpcEnv: Rp // take into account executors that are pending to be added or removed. 
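    // Illustrative example (values made up): killing two executors that both belong to the
    // default profile lowers requestedTotalExecutorsPerResourceProfile(defaultProfile) by 2,
    // never dropping below zero, before the new per-profile totals are re-sent to the
    // cluster manager via doRequestTotalExecutors.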
val adjustTotalExecutors = if (adjustTargetNumExecutors) { - requestedTotalExecutors = math.max(requestedTotalExecutors - executorsToKill.size, 0) - if (requestedTotalExecutors != - (numExistingExecutors + numPendingExecutors - executorsPendingToRemove.size)) { - logDebug( - s"""killExecutors($executorIds, $adjustTargetNumExecutors, $countFailures, $force): - |Executor counts do not match: - |requestedTotalExecutors = $requestedTotalExecutors - |numExistingExecutors = $numExistingExecutors - |numPendingExecutors = $numPendingExecutors - |executorsPendingToRemove = ${executorsPendingToRemove.size}""".stripMargin) + executorsToKill.foreach { exec => + val rpId = executorDataMap(exec).resourceProfileId + val rp = scheduler.sc.resourceProfileManager.resourceProfileFromId(rpId) + if (requestedTotalExecutorsPerResourceProfile.isEmpty) { + // Assume that we are killing an executor that was started by default and + // not through the request api + requestedTotalExecutorsPerResourceProfile(rp) = 0 + } else { + val requestedTotalForRp = requestedTotalExecutorsPerResourceProfile(rp) + requestedTotalExecutorsPerResourceProfile(rp) = math.max(requestedTotalForRp - 1, 0) + } } - doRequestTotalExecutors(requestedTotalExecutors) + doRequestTotalExecutors(requestedTotalExecutorsPerResourceProfile.toMap) } else { - numPendingExecutors += executorsToKill.size Future.successful(true) } diff --git a/core/src/main/scala/org/apache/spark/scheduler/cluster/StandaloneSchedulerBackend.scala b/core/src/main/scala/org/apache/spark/scheduler/cluster/StandaloneSchedulerBackend.scala index a9b607d8cc38c..d91d78b29f98d 100644 --- a/core/src/main/scala/org/apache/spark/scheduler/cluster/StandaloneSchedulerBackend.scala +++ b/core/src/main/scala/org/apache/spark/scheduler/cluster/StandaloneSchedulerBackend.scala @@ -28,7 +28,7 @@ import org.apache.spark.deploy.client.{StandaloneAppClient, StandaloneAppClientL import org.apache.spark.internal.{config, Logging} import org.apache.spark.internal.config.Tests.IS_TESTING import org.apache.spark.launcher.{LauncherBackend, SparkAppHandle} -import org.apache.spark.resource.ResourceUtils +import org.apache.spark.resource.{ResourceProfile, ResourceUtils} import org.apache.spark.rpc.RpcEndpointAddress import org.apache.spark.scheduler._ import org.apache.spark.util.Utils @@ -58,6 +58,7 @@ private[spark] class StandaloneSchedulerBackend( private val maxCores = conf.get(config.CORES_MAX) private val totalExpectedCores = maxCores.getOrElse(0) + private val defaultProf = sc.resourceProfileManager.defaultResourceProfile override def start(): Unit = { super.start() @@ -194,9 +195,13 @@ private[spark] class StandaloneSchedulerBackend( * * @return whether the request is acknowledged. 
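   * Illustrative note (not part of this change): standalone mode only forwards the count for
   * the default profile, so a request like Map(defaultProf -> 4, someOtherProfile -> 2) ends
   * up as c.requestTotalExecutors(4); `someOtherProfile` is a made-up name for illustration.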
*/ - protected override def doRequestTotalExecutors(requestedTotal: Int): Future[Boolean] = { + protected override def doRequestTotalExecutors( + resourceProfileToTotalExecs: Map[ResourceProfile, Int]): Future[Boolean] = { + // resources profiles not supported Option(client) match { - case Some(c) => c.requestTotalExecutors(requestedTotal) + case Some(c) => + val numExecs = resourceProfileToTotalExecs.getOrElse(defaultProf, 0) + c.requestTotalExecutors(numExecs) case None => logWarning("Attempted to request executors before driver fully initialized.") Future.successful(false) diff --git a/core/src/main/scala/org/apache/spark/scheduler/dynalloc/ExecutorMonitor.scala b/core/src/main/scala/org/apache/spark/scheduler/dynalloc/ExecutorMonitor.scala index a24f1902faa31..c29546b7577fc 100644 --- a/core/src/main/scala/org/apache/spark/scheduler/dynalloc/ExecutorMonitor.scala +++ b/core/src/main/scala/org/apache/spark/scheduler/dynalloc/ExecutorMonitor.scala @@ -70,7 +70,7 @@ private[spark] class ExecutorMonitor( // this listener. There are safeguards in other parts of the code that would prevent that executor // from being removed. private val nextTimeout = new AtomicLong(Long.MaxValue) - private var timedOutExecs = Seq.empty[String] + private var timedOutExecs = Seq.empty[(String, Int)] // Active job tracking. // @@ -100,10 +100,10 @@ private[spark] class ExecutorMonitor( } /** - * Returns the list of executors that are currently considered to be timed out. - * Should only be called from the EAM thread. + * Returns the list of executors and their ResourceProfile id that are currently considered to + * be timed out. Should only be called from the EAM thread. */ - def timedOutExecutors(): Seq[String] = { + def timedOutExecutors(): Seq[(String, Int)] = { val now = clock.nanoTime() if (now >= nextTimeout.get()) { // Temporarily set the next timeout at Long.MaxValue. 
This ensures that after @@ -126,7 +126,7 @@ private[spark] class ExecutorMonitor( true } } - .keys + .map { case (name, exec) => (name, exec.resourceProfileId)} .toSeq updateNextTimeout(newNextTimeout) } @@ -155,6 +155,7 @@ private[spark] class ExecutorMonitor( execResourceProfileCount.getOrDefault(id, 0) } + // for testing def getResourceProfileId(executorId: String): Int = { val execTrackingInfo = executors.get(executorId) if (execTrackingInfo != null) { diff --git a/core/src/main/scala/org/apache/spark/util/JsonProtocol.scala b/core/src/main/scala/org/apache/spark/util/JsonProtocol.scala index 4d89c4f079f29..53824735d2fc5 100644 --- a/core/src/main/scala/org/apache/spark/util/JsonProtocol.scala +++ b/core/src/main/scala/org/apache/spark/util/JsonProtocol.scala @@ -33,7 +33,7 @@ import org.apache.spark._ import org.apache.spark.executor._ import org.apache.spark.metrics.ExecutorMetricType import org.apache.spark.rdd.RDDOperationScope -import org.apache.spark.resource.ResourceInformation +import org.apache.spark.resource.{ResourceInformation, ResourceProfile} import org.apache.spark.scheduler._ import org.apache.spark.scheduler.cluster.ExecutorInfo import org.apache.spark.storage._ @@ -662,7 +662,8 @@ private[spark] object JsonProtocol { val stageInfos = jsonOption(json \ "Stage Infos") .map(_.extract[Seq[JValue]].map(stageInfoFromJson)).getOrElse { stageIds.map { id => - new StageInfo(id, 0, "unknown", 0, Seq.empty, Seq.empty, "unknown") + new StageInfo(id, 0, "unknown", 0, Seq.empty, Seq.empty, "unknown", + resourceProfileId = ResourceProfile.DEFAULT_RESOURCE_PROFILE_ID) } } SparkListenerJobStart(jobId, submissionTime, stageInfos, properties) @@ -803,7 +804,8 @@ private[spark] object JsonProtocol { } val stageInfo = new StageInfo( - stageId, attemptId, stageName, numTasks, rddInfos, parentIds, details) + stageId, attemptId, stageName, numTasks, rddInfos, parentIds, details, + resourceProfileId = ResourceProfile.DEFAULT_RESOURCE_PROFILE_ID) stageInfo.submissionTime = submissionTime stageInfo.completionTime = completionTime stageInfo.failureReason = failureReason diff --git a/core/src/test/scala/org/apache/spark/ExecutorAllocationManagerSuite.scala b/core/src/test/scala/org/apache/spark/ExecutorAllocationManagerSuite.scala index 8d958494d52be..8fa33f4915ea4 100644 --- a/core/src/test/scala/org/apache/spark/ExecutorAllocationManagerSuite.scala +++ b/core/src/test/scala/org/apache/spark/ExecutorAllocationManagerSuite.scala @@ -29,6 +29,7 @@ import org.apache.spark.executor.ExecutorMetrics import org.apache.spark.internal.config import org.apache.spark.internal.config.Tests.TEST_SCHEDULE_INTERVAL import org.apache.spark.metrics.MetricsSystem +import org.apache.spark.resource.{ExecutorResourceRequests, ResourceProfile, ResourceProfileBuilder, ResourceProfileManager, TaskResourceRequests} import org.apache.spark.resource.ResourceProfile.DEFAULT_RESOURCE_PROFILE_ID import org.apache.spark.scheduler._ import org.apache.spark.scheduler.cluster.ExecutorInfo @@ -45,6 +46,9 @@ class ExecutorAllocationManagerSuite extends SparkFunSuite { private val managers = new mutable.ListBuffer[ExecutorAllocationManager]() private var listenerBus: LiveListenerBus = _ private var client: ExecutorAllocationClient = _ + private val clock = new SystemClock() + private var rpManager: ResourceProfileManager = _ + override def beforeEach(): Unit = { super.beforeEach() @@ -108,65 +112,257 @@ class ExecutorAllocationManagerSuite extends SparkFunSuite { test("starting state") { val manager = createManager(createConf()) - 
assert(numExecutorsTarget(manager) === 1) + assert(numExecutorsTargetForDefaultProfileId(manager) === 1) assert(executorsPendingToRemove(manager).isEmpty) assert(addTime(manager) === ExecutorAllocationManager.NOT_SET) } - test("add executors") { + test("add executors default profile") { val manager = createManager(createConf(1, 10, 1)) post(SparkListenerStageSubmitted(createStageInfo(0, 1000))) + val updatesNeeded = + new mutable.HashMap[ResourceProfile, ExecutorAllocationManager.TargetNumUpdates] + + // Keep adding until the limit is reached + assert(numExecutorsTargetForDefaultProfileId(manager) === 1) + assert(numExecutorsToAddForDefaultProfile(manager) === 1) + assert(addExecutorsToTargetForDefaultProfile(manager, updatesNeeded) === 1) + doUpdateRequest(manager, updatesNeeded.toMap, clock.getTimeMillis()) + assert(numExecutorsTargetForDefaultProfileId(manager) === 2) + assert(numExecutorsToAddForDefaultProfile(manager) === 2) + assert(addExecutorsToTargetForDefaultProfile(manager, updatesNeeded) === 2) + doUpdateRequest(manager, updatesNeeded.toMap, clock.getTimeMillis()) + assert(numExecutorsTargetForDefaultProfileId(manager) === 4) + assert(numExecutorsToAddForDefaultProfile(manager) === 4) + assert(addExecutorsToTargetForDefaultProfile(manager, updatesNeeded) === 4) + doUpdateRequest(manager, updatesNeeded.toMap, clock.getTimeMillis()) + assert(numExecutorsTargetForDefaultProfileId(manager) === 8) + assert(numExecutorsToAddForDefaultProfile(manager) === 8) + // reached the limit of 10 + assert(addExecutorsToTargetForDefaultProfile(manager, updatesNeeded) === 2) + doUpdateRequest(manager, updatesNeeded.toMap, clock.getTimeMillis()) + assert(numExecutorsTargetForDefaultProfileId(manager) === 10) + assert(numExecutorsToAddForDefaultProfile(manager) === 1) + assert(addExecutorsToTargetForDefaultProfile(manager, updatesNeeded) === 0) + doUpdateRequest(manager, updatesNeeded.toMap, clock.getTimeMillis()) + assert(numExecutorsTargetForDefaultProfileId(manager) === 10) + assert(numExecutorsToAddForDefaultProfile(manager) === 1) + + // Register previously requested executors + onExecutorAddedDefaultProfile(manager, "first") + assert(numExecutorsTargetForDefaultProfileId(manager) === 10) + onExecutorAddedDefaultProfile(manager, "second") + onExecutorAddedDefaultProfile(manager, "third") + onExecutorAddedDefaultProfile(manager, "fourth") + assert(numExecutorsTargetForDefaultProfileId(manager) === 10) + onExecutorAddedDefaultProfile(manager, "first") // duplicates should not count + onExecutorAddedDefaultProfile(manager, "second") + assert(numExecutorsTargetForDefaultProfileId(manager) === 10) + + // Try adding again + // This should still fail because the number pending + running is still at the limit + assert(addExecutorsToTargetForDefaultProfile(manager, updatesNeeded) === 0) + doUpdateRequest(manager, updatesNeeded.toMap, clock.getTimeMillis()) + assert(numExecutorsTargetForDefaultProfileId(manager) === 10) + assert(numExecutorsToAddForDefaultProfile(manager) === 1) + assert(addExecutorsToTargetForDefaultProfile(manager, updatesNeeded) === 0) + doUpdateRequest(manager, updatesNeeded.toMap, clock.getTimeMillis()) + assert(numExecutorsTargetForDefaultProfileId(manager) === 10) + assert(numExecutorsToAddForDefaultProfile(manager) === 1) + } + + test("add executors multiple profiles") { + val manager = createManager(createConf(1, 10, 1)) + post(SparkListenerStageSubmitted(createStageInfo(0, 1000, rp = defaultProfile))) + val rp1 = new ResourceProfileBuilder() + val execReqs = new 
ExecutorResourceRequests().cores(4).resource("gpu", 4) + val taskReqs = new TaskResourceRequests().cpus(1).resource("gpu", 1) + rp1.require(execReqs).require(taskReqs) + val rprof1 = rp1.build + rpManager.addResourceProfile(rprof1) + post(SparkListenerStageSubmitted(createStageInfo(1, 1000, rp = rprof1))) + val updatesNeeded = + new mutable.HashMap[ResourceProfile, ExecutorAllocationManager.TargetNumUpdates] + // Keep adding until the limit is reached - assert(numExecutorsTarget(manager) === 1) - assert(numExecutorsToAdd(manager) === 1) - assert(addExecutors(manager) === 1) - assert(numExecutorsTarget(manager) === 2) - assert(numExecutorsToAdd(manager) === 2) - assert(addExecutors(manager) === 2) - assert(numExecutorsTarget(manager) === 4) - assert(numExecutorsToAdd(manager) === 4) - assert(addExecutors(manager) === 4) - assert(numExecutorsTarget(manager) === 8) - assert(numExecutorsToAdd(manager) === 8) - assert(addExecutors(manager) === 2) // reached the limit of 10 - assert(numExecutorsTarget(manager) === 10) - assert(numExecutorsToAdd(manager) === 1) - assert(addExecutors(manager) === 0) - assert(numExecutorsTarget(manager) === 10) - assert(numExecutorsToAdd(manager) === 1) + assert(numExecutorsTargetForDefaultProfileId(manager) === 1) + assert(numExecutorsToAddForDefaultProfile(manager) === 1) + assert(addExecutorsToTargetForDefaultProfile(manager, updatesNeeded) === 1) + assert(numExecutorsToAdd(manager, rprof1) === 1) + assert(numExecutorsTarget(manager, rprof1.id) === 1) + assert(addExecutorsToTarget(manager, updatesNeeded, rprof1) === 1) + doUpdateRequest(manager, updatesNeeded.toMap, clock.getTimeMillis()) + assert(numExecutorsTargetForDefaultProfileId(manager) === 2) + assert(numExecutorsToAddForDefaultProfile(manager) === 2) + assert(addExecutorsToTargetForDefaultProfile(manager, updatesNeeded) === 2) + assert(numExecutorsToAdd(manager, rprof1) === 2) + assert(numExecutorsTarget(manager, rprof1.id) === 2) + assert(addExecutorsToTarget(manager, updatesNeeded, rprof1) === 2) + doUpdateRequest(manager, updatesNeeded.toMap, clock.getTimeMillis()) + assert(numExecutorsTargetForDefaultProfileId(manager) === 4) + assert(numExecutorsToAddForDefaultProfile(manager) === 4) + assert(addExecutorsToTargetForDefaultProfile(manager, updatesNeeded) === 4) + assert(numExecutorsToAdd(manager, rprof1) === 4) + assert(numExecutorsTarget(manager, rprof1.id) === 4) + assert(addExecutorsToTarget(manager, updatesNeeded, rprof1) === 4) + doUpdateRequest(manager, updatesNeeded.toMap, clock.getTimeMillis()) + assert(numExecutorsTargetForDefaultProfileId(manager) === 8) + assert(numExecutorsToAddForDefaultProfile(manager) === 8) + // reached the limit of 10 + assert(addExecutorsToTargetForDefaultProfile(manager, updatesNeeded) === 2) + assert(numExecutorsToAdd(manager, rprof1) === 8) + assert(numExecutorsTarget(manager, rprof1.id) === 8) + assert(addExecutorsToTarget(manager, updatesNeeded, rprof1) === 2) + doUpdateRequest(manager, updatesNeeded.toMap, clock.getTimeMillis()) + assert(numExecutorsTargetForDefaultProfileId(manager) === 10) + assert(numExecutorsToAddForDefaultProfile(manager) === 1) + assert(addExecutorsToTargetForDefaultProfile(manager, updatesNeeded) === 0) + assert(numExecutorsToAdd(manager, rprof1) === 1) + assert(numExecutorsTarget(manager, rprof1.id) === 10) + assert(addExecutorsToTarget(manager, updatesNeeded, rprof1) === 0) + doUpdateRequest(manager, updatesNeeded.toMap, clock.getTimeMillis()) + assert(numExecutorsTargetForDefaultProfileId(manager) === 10) + 
assert(numExecutorsToAddForDefaultProfile(manager) === 1) + assert(numExecutorsToAdd(manager, rprof1) === 1) + assert(numExecutorsTarget(manager, rprof1.id) === 10) // Register previously requested executors - onExecutorAdded(manager, "first") - assert(numExecutorsTarget(manager) === 10) - onExecutorAdded(manager, "second") - onExecutorAdded(manager, "third") - onExecutorAdded(manager, "fourth") - assert(numExecutorsTarget(manager) === 10) - onExecutorAdded(manager, "first") // duplicates should not count - onExecutorAdded(manager, "second") - assert(numExecutorsTarget(manager) === 10) + onExecutorAddedDefaultProfile(manager, "first") + onExecutorAdded(manager, "firstrp1", rprof1) + assert(numExecutorsTargetForDefaultProfileId(manager) === 10) + assert(numExecutorsTarget(manager, rprof1.id) === 10) + onExecutorAddedDefaultProfile(manager, "second") + onExecutorAddedDefaultProfile(manager, "third") + onExecutorAddedDefaultProfile(manager, "fourth") + onExecutorAdded(manager, "secondrp1", rprof1) + onExecutorAdded(manager, "thirdrp1", rprof1) + onExecutorAdded(manager, "fourthrp1", rprof1) + assert(numExecutorsTargetForDefaultProfileId(manager) === 10) + assert(numExecutorsTarget(manager, rprof1.id) === 10) + onExecutorAddedDefaultProfile(manager, "first") // duplicates should not count + onExecutorAddedDefaultProfile(manager, "second") + onExecutorAdded(manager, "firstrp1", rprof1) + onExecutorAdded(manager, "secondrp1", rprof1) + assert(numExecutorsTargetForDefaultProfileId(manager) === 10) + assert(numExecutorsTarget(manager, rprof1.id) === 10) // Try adding again // This should still fail because the number pending + running is still at the limit - assert(addExecutors(manager) === 0) - assert(numExecutorsTarget(manager) === 10) - assert(numExecutorsToAdd(manager) === 1) - assert(addExecutors(manager) === 0) - assert(numExecutorsTarget(manager) === 10) - assert(numExecutorsToAdd(manager) === 1) + assert(addExecutorsToTargetForDefaultProfile(manager, updatesNeeded) === 0) + assert(addExecutorsToTarget(manager, updatesNeeded, rprof1) === 0) + doUpdateRequest(manager, updatesNeeded.toMap, clock.getTimeMillis()) + assert(numExecutorsTargetForDefaultProfileId(manager) === 10) + assert(numExecutorsToAddForDefaultProfile(manager) === 1) + assert(numExecutorsToAdd(manager, rprof1) === 1) + assert(numExecutorsTarget(manager, rprof1.id) === 10) + assert(addExecutorsToTargetForDefaultProfile(manager, updatesNeeded) === 0) + assert(addExecutorsToTarget(manager, updatesNeeded, rprof1) === 0) + doUpdateRequest(manager, updatesNeeded.toMap, clock.getTimeMillis()) + assert(numExecutorsTargetForDefaultProfileId(manager) === 10) + assert(numExecutorsToAddForDefaultProfile(manager) === 1) + assert(numExecutorsToAdd(manager, rprof1) === 1) + assert(numExecutorsTarget(manager, rprof1.id) === 10) + } + + test("remove executors multiple profiles") { + val manager = createManager(createConf(5, 10, 5)) + val rp1 = new ResourceProfileBuilder() + val execReqs = new ExecutorResourceRequests().cores(4).resource("gpu", 4) + val taskReqs = new TaskResourceRequests().cpus(1).resource("gpu", 1) + rp1.require(execReqs).require(taskReqs) + val rprof1 = rp1.build + val rp2 = new ResourceProfileBuilder() + val execReqs2 = new ExecutorResourceRequests().cores(1) + val taskReqs2 = new TaskResourceRequests().cpus(1) + rp2.require(execReqs2).require(taskReqs2) + val rprof2 = rp2.build + rpManager.addResourceProfile(rprof1) + rpManager.addResourceProfile(rprof2) + post(SparkListenerStageSubmitted(createStageInfo(1, 10, rp = 
rprof1))) + post(SparkListenerStageSubmitted(createStageInfo(2, 10, rp = rprof2))) + + (1 to 10).map(_.toString).foreach { id => onExecutorAdded(manager, id, rprof1) } + (11 to 20).map(_.toString).foreach { id => onExecutorAdded(manager, id, rprof2) } + (21 to 30).map(_.toString).foreach { id => onExecutorAdded(manager, id, defaultProfile) } + + // Keep removing until the limit is reached + assert(executorsPendingToRemove(manager).isEmpty) + assert(removeExecutor(manager, "1", rprof1.id)) + assert(executorsPendingToRemove(manager).size === 1) + assert(executorsPendingToRemove(manager).contains("1")) + assert(removeExecutor(manager, "11", rprof2.id)) + assert(removeExecutor(manager, "2", rprof1.id)) + assert(executorsPendingToRemove(manager).size === 3) + assert(executorsPendingToRemove(manager).contains("2")) + assert(executorsPendingToRemove(manager).contains("11")) + assert(removeExecutor(manager, "21", defaultProfile.id)) + assert(removeExecutor(manager, "3", rprof1.id)) + assert(removeExecutor(manager, "4", rprof1.id)) + assert(executorsPendingToRemove(manager).size === 6) + assert(executorsPendingToRemove(manager).contains("21")) + assert(executorsPendingToRemove(manager).contains("3")) + assert(executorsPendingToRemove(manager).contains("4")) + assert(removeExecutor(manager, "5", rprof1.id)) + assert(!removeExecutor(manager, "6", rprof1.id)) // reached the limit of 5 + assert(executorsPendingToRemove(manager).size === 7) + assert(executorsPendingToRemove(manager).contains("5")) + assert(!executorsPendingToRemove(manager).contains("6")) + + // Kill executors previously requested to remove + onExecutorRemoved(manager, "1") + assert(executorsPendingToRemove(manager).size === 6) + assert(!executorsPendingToRemove(manager).contains("1")) + onExecutorRemoved(manager, "2") + onExecutorRemoved(manager, "3") + assert(executorsPendingToRemove(manager).size === 4) + assert(!executorsPendingToRemove(manager).contains("2")) + assert(!executorsPendingToRemove(manager).contains("3")) + onExecutorRemoved(manager, "2") // duplicates should not count + onExecutorRemoved(manager, "3") + assert(executorsPendingToRemove(manager).size === 4) + onExecutorRemoved(manager, "4") + onExecutorRemoved(manager, "5") + assert(executorsPendingToRemove(manager).size === 2) + assert(executorsPendingToRemove(manager).contains("11")) + assert(executorsPendingToRemove(manager).contains("21")) + + // Try removing again + // This should still fail because the number pending + running is still at the limit + assert(!removeExecutor(manager, "7", rprof1.id)) + assert(executorsPendingToRemove(manager).size === 2) + assert(!removeExecutor(manager, "8", rprof1.id)) + assert(executorsPendingToRemove(manager).size === 2) + + // make sure rprof2 has the same min limit or 5 + assert(removeExecutor(manager, "12", rprof2.id)) + assert(removeExecutor(manager, "13", rprof2.id)) + assert(removeExecutor(manager, "14", rprof2.id)) + assert(removeExecutor(manager, "15", rprof2.id)) + assert(!removeExecutor(manager, "16", rprof2.id)) // reached the limit of 5 + assert(executorsPendingToRemove(manager).size === 6) + assert(!executorsPendingToRemove(manager).contains("16")) + onExecutorRemoved(manager, "11") + onExecutorRemoved(manager, "12") + onExecutorRemoved(manager, "13") + onExecutorRemoved(manager, "14") + onExecutorRemoved(manager, "15") + assert(executorsPendingToRemove(manager).size === 1) } def testAllocationRatio(cores: Int, divisor: Double, expected: Int): Unit = { + val updatesNeeded = + new mutable.HashMap[ResourceProfile, 
ExecutorAllocationManager.TargetNumUpdates] val conf = createConf(3, 15) .set(config.DYN_ALLOCATION_EXECUTOR_ALLOCATION_RATIO, divisor) .set(config.EXECUTOR_CORES, cores) val manager = createManager(conf) post(SparkListenerStageSubmitted(createStageInfo(0, 20))) for (i <- 0 to 5) { - addExecutors(manager) + addExecutorsToTargetForDefaultProfile(manager, updatesNeeded) + doUpdateRequest(manager, updatesNeeded.toMap, clock.getTimeMillis()) } - assert(numExecutorsTarget(manager) === expected) + assert(numExecutorsTargetForDefaultProfileId(manager) === expected) } test("executionAllocationRatio is correctly handled") { @@ -185,127 +381,158 @@ class ExecutorAllocationManagerSuite extends SparkFunSuite { val manager = createManager(createConf(0, 10, 0)) post(SparkListenerStageSubmitted(createStageInfo(0, 5))) + val updatesNeeded = + new mutable.HashMap[ResourceProfile, ExecutorAllocationManager.TargetNumUpdates] + // Verify that we're capped at number of tasks in the stage - assert(numExecutorsTarget(manager) === 0) - assert(numExecutorsToAdd(manager) === 1) - assert(addExecutors(manager) === 1) - assert(numExecutorsTarget(manager) === 1) - assert(numExecutorsToAdd(manager) === 2) - assert(addExecutors(manager) === 2) - assert(numExecutorsTarget(manager) === 3) - assert(numExecutorsToAdd(manager) === 4) - assert(addExecutors(manager) === 2) - assert(numExecutorsTarget(manager) === 5) - assert(numExecutorsToAdd(manager) === 1) + assert(numExecutorsTargetForDefaultProfileId(manager) === 0) + assert(numExecutorsToAddForDefaultProfile(manager) === 1) + assert(addExecutorsToTargetForDefaultProfile(manager, updatesNeeded) === 1) + doUpdateRequest(manager, updatesNeeded.toMap, clock.getTimeMillis()) + assert(numExecutorsTargetForDefaultProfileId(manager) === 1) + assert(numExecutorsToAddForDefaultProfile(manager) === 2) + assert(addExecutorsToTargetForDefaultProfile(manager, updatesNeeded) === 2) + doUpdateRequest(manager, updatesNeeded.toMap, clock.getTimeMillis()) + assert(numExecutorsTargetForDefaultProfileId(manager) === 3) + assert(numExecutorsToAddForDefaultProfile(manager) === 4) + assert(addExecutorsToTargetForDefaultProfile(manager, updatesNeeded) === 2) + doUpdateRequest(manager, updatesNeeded.toMap, clock.getTimeMillis()) + assert(numExecutorsTargetForDefaultProfileId(manager) === 5) + assert(numExecutorsToAddForDefaultProfile(manager) === 1) // Verify that running a task doesn't affect the target post(SparkListenerStageSubmitted(createStageInfo(1, 3))) post(SparkListenerExecutorAdded( 0L, "executor-1", new ExecutorInfo("host1", 1, Map.empty, Map.empty))) post(SparkListenerTaskStart(1, 0, createTaskInfo(0, 0, "executor-1"))) - assert(numExecutorsTarget(manager) === 5) - assert(addExecutors(manager) === 1) - assert(numExecutorsTarget(manager) === 6) - assert(numExecutorsToAdd(manager) === 2) - assert(addExecutors(manager) === 2) - assert(numExecutorsTarget(manager) === 8) - assert(numExecutorsToAdd(manager) === 4) - assert(addExecutors(manager) === 0) - assert(numExecutorsTarget(manager) === 8) - assert(numExecutorsToAdd(manager) === 1) + assert(numExecutorsTargetForDefaultProfileId(manager) === 5) + assert(addExecutorsToTargetForDefaultProfile(manager, updatesNeeded) === 1) + doUpdateRequest(manager, updatesNeeded.toMap, clock.getTimeMillis()) + assert(numExecutorsTargetForDefaultProfileId(manager) === 6) + assert(numExecutorsToAddForDefaultProfile(manager) === 2) + assert(addExecutorsToTargetForDefaultProfile(manager, updatesNeeded) === 2) + + doUpdateRequest(manager, updatesNeeded.toMap, 
clock.getTimeMillis()) + assert(numExecutorsTargetForDefaultProfileId(manager) === 8) + assert(numExecutorsToAddForDefaultProfile(manager) === 4) + assert(addExecutorsToTargetForDefaultProfile(manager, updatesNeeded) === 0) + doUpdateRequest(manager, updatesNeeded.toMap, clock.getTimeMillis()) + assert(numExecutorsTargetForDefaultProfileId(manager) === 8) + assert(numExecutorsToAddForDefaultProfile(manager) === 1) // Verify that re-running a task doesn't blow things up post(SparkListenerStageSubmitted(createStageInfo(2, 3))) post(SparkListenerTaskStart(2, 0, createTaskInfo(0, 0, "executor-1"))) post(SparkListenerTaskStart(2, 0, createTaskInfo(1, 0, "executor-1"))) - assert(addExecutors(manager) === 1) - assert(numExecutorsTarget(manager) === 9) - assert(numExecutorsToAdd(manager) === 2) - assert(addExecutors(manager) === 1) - assert(numExecutorsTarget(manager) === 10) - assert(numExecutorsToAdd(manager) === 1) + assert(addExecutorsToTargetForDefaultProfile(manager, updatesNeeded) === 1) + doUpdateRequest(manager, updatesNeeded.toMap, clock.getTimeMillis()) + assert(numExecutorsTargetForDefaultProfileId(manager) === 9) + assert(numExecutorsToAddForDefaultProfile(manager) === 2) + assert(addExecutorsToTargetForDefaultProfile(manager, updatesNeeded) === 1) + doUpdateRequest(manager, updatesNeeded.toMap, clock.getTimeMillis()) + assert(numExecutorsTargetForDefaultProfileId(manager) === 10) + assert(numExecutorsToAddForDefaultProfile(manager) === 1) // Verify that running a task once we're at our limit doesn't blow things up post(SparkListenerTaskStart(2, 0, createTaskInfo(0, 1, "executor-1"))) - assert(addExecutors(manager) === 0) - assert(numExecutorsTarget(manager) === 10) + assert(addExecutorsToTargetForDefaultProfile(manager, updatesNeeded) === 0) + doUpdateRequest(manager, updatesNeeded.toMap, clock.getTimeMillis()) + assert(numExecutorsTargetForDefaultProfileId(manager) === 10) } test("add executors when speculative tasks added") { val manager = createManager(createConf(0, 10, 0)) + val updatesNeeded = + new mutable.HashMap[ResourceProfile, ExecutorAllocationManager.TargetNumUpdates] + + post(SparkListenerStageSubmitted(createStageInfo(1, 2))) // Verify that we're capped at number of tasks including the speculative ones in the stage post(SparkListenerSpeculativeTaskSubmitted(1)) - assert(numExecutorsTarget(manager) === 0) - assert(numExecutorsToAdd(manager) === 1) - assert(addExecutors(manager) === 1) + assert(numExecutorsTargetForDefaultProfileId(manager) === 0) + assert(numExecutorsToAddForDefaultProfile(manager) === 1) + assert(addExecutorsToTargetForDefaultProfile(manager, updatesNeeded) === 1) + doUpdateRequest(manager, updatesNeeded.toMap, clock.getTimeMillis()) post(SparkListenerSpeculativeTaskSubmitted(1)) post(SparkListenerSpeculativeTaskSubmitted(1)) - post(SparkListenerStageSubmitted(createStageInfo(1, 2))) - assert(numExecutorsTarget(manager) === 1) - assert(numExecutorsToAdd(manager) === 2) - assert(addExecutors(manager) === 2) - assert(numExecutorsTarget(manager) === 3) - assert(numExecutorsToAdd(manager) === 4) - assert(addExecutors(manager) === 2) - assert(numExecutorsTarget(manager) === 5) - assert(numExecutorsToAdd(manager) === 1) + assert(numExecutorsTargetForDefaultProfileId(manager) === 1) + assert(numExecutorsToAddForDefaultProfile(manager) === 2) + assert(addExecutorsToTargetForDefaultProfile(manager, updatesNeeded) === 2) + doUpdateRequest(manager, updatesNeeded.toMap, clock.getTimeMillis()) + assert(numExecutorsTargetForDefaultProfileId(manager) === 3) + 
assert(numExecutorsToAddForDefaultProfile(manager) === 4) + assert(addExecutorsToTargetForDefaultProfile(manager, updatesNeeded) === 2) + doUpdateRequest(manager, updatesNeeded.toMap, clock.getTimeMillis()) + assert(numExecutorsTargetForDefaultProfileId(manager) === 5) + assert(numExecutorsToAddForDefaultProfile(manager) === 1) // Verify that running a task doesn't affect the target post(SparkListenerTaskStart(1, 0, createTaskInfo(0, 0, "executor-1"))) - assert(numExecutorsTarget(manager) === 5) - assert(addExecutors(manager) === 0) - assert(numExecutorsToAdd(manager) === 1) + assert(numExecutorsTargetForDefaultProfileId(manager) === 5) + assert(addExecutorsToTargetForDefaultProfile(manager, updatesNeeded) === 0) + doUpdateRequest(manager, updatesNeeded.toMap, clock.getTimeMillis()) + assert(numExecutorsToAddForDefaultProfile(manager) === 1) // Verify that running a speculative task doesn't affect the target post(SparkListenerTaskStart(1, 0, createTaskInfo(1, 0, "executor-2", true))) - assert(numExecutorsTarget(manager) === 5) - assert(addExecutors(manager) === 0) - assert(numExecutorsToAdd(manager) === 1) + assert(numExecutorsTargetForDefaultProfileId(manager) === 5) + assert(addExecutorsToTargetForDefaultProfile(manager, updatesNeeded) === 0) + doUpdateRequest(manager, updatesNeeded.toMap, clock.getTimeMillis()) + assert(numExecutorsToAddForDefaultProfile(manager) === 1) } test("SPARK-30511 remove executors when speculative tasks end") { val clock = new ManualClock() val stage = createStageInfo(0, 40) - val manager = createManager(createConf(0, 10, 0).set(config.EXECUTOR_CORES, 4), clock = clock) + val conf = createConf(0, 10, 0).set(config.EXECUTOR_CORES, 4) + val manager = createManager(conf, clock = clock) + val updatesNeeded = + new mutable.HashMap[ResourceProfile, ExecutorAllocationManager.TargetNumUpdates] post(SparkListenerStageSubmitted(stage)) - assert(addExecutors(manager) === 1) - assert(addExecutors(manager) === 2) - assert(addExecutors(manager) === 4) - assert(addExecutors(manager) === 3) - - (0 to 9).foreach(execId => onExecutorAdded(manager, execId.toString)) + assert(addExecutorsToTargetForDefaultProfile(manager, updatesNeeded) === 1) + doUpdateRequest(manager, updatesNeeded.toMap, clock.getTimeMillis()) + assert(addExecutorsToTargetForDefaultProfile(manager, updatesNeeded) === 2) + doUpdateRequest(manager, updatesNeeded.toMap, clock.getTimeMillis()) + assert(addExecutorsToTargetForDefaultProfile(manager, updatesNeeded) === 4) + doUpdateRequest(manager, updatesNeeded.toMap, clock.getTimeMillis()) + assert(addExecutorsToTargetForDefaultProfile(manager, updatesNeeded) === 3) + doUpdateRequest(manager, updatesNeeded.toMap, clock.getTimeMillis()) + + (0 to 9).foreach(execId => onExecutorAddedDefaultProfile(manager, execId.toString)) (0 to 39).map { i => createTaskInfo(i, i, executorId = s"${i / 4}")}.foreach { info => post(SparkListenerTaskStart(0, 0, info)) } - assert(numExecutorsTarget(manager) === 10) - assert(maxNumExecutorsNeeded(manager) == 10) + assert(numExecutorsTarget(manager, defaultProfile.id) === 10) + assert(maxNumExecutorsNeededPerResourceProfile(manager, defaultProfile) == 10) // 30 tasks (0 - 29) finished (0 to 29).map { i => createTaskInfo(i, i, executorId = s"${i / 4}")}.foreach { info => post(SparkListenerTaskEnd(0, 0, null, Success, info, new ExecutorMetrics, null)) } clock.advance(1000) manager invokePrivate _updateAndSyncNumExecutorsTarget(clock.nanoTime()) - assert(numExecutorsTarget(manager) === 3) - assert(maxNumExecutorsNeeded(manager) == 3) - (0 to 
6).foreach { i => assert(removeExecutor(manager, i.toString))} + assert(numExecutorsTarget(manager, defaultProfile.id) === 3) + assert(maxNumExecutorsNeededPerResourceProfile(manager, defaultProfile) == 3) + (0 to 6).foreach { i => assert(removeExecutorDefaultProfile(manager, i.toString))} (0 to 6).foreach { i => onExecutorRemoved(manager, i.toString)} // 10 speculative tasks (30 - 39) launch for the remaining tasks (30 to 39).foreach { _ => post(SparkListenerSpeculativeTaskSubmitted(0))} - assert(addExecutors(manager) === 1) - assert(addExecutors(manager) === 1) - assert(numExecutorsTarget(manager) == 5) - assert(maxNumExecutorsNeeded(manager) == 5) - (10 to 12).foreach(execId => onExecutorAdded(manager, execId.toString)) + assert(addExecutorsToTargetForDefaultProfile(manager, updatesNeeded) === 1) + doUpdateRequest(manager, updatesNeeded.toMap, clock.getTimeMillis()) + assert(addExecutorsToTargetForDefaultProfile(manager, updatesNeeded) === 1) + doUpdateRequest(manager, updatesNeeded.toMap, clock.getTimeMillis()) + assert(numExecutorsTarget(manager, defaultProfile.id) == 5) + assert(maxNumExecutorsNeededPerResourceProfile(manager, defaultProfile) == 5) + (10 to 12).foreach(execId => onExecutorAddedDefaultProfile(manager, execId.toString)) (40 to 49).map { i => createTaskInfo(taskId = i, taskIndex = i - 10, executorId = s"${i / 4}", speculative = true)} .foreach { info => post(SparkListenerTaskStart(0, 0, info))} clock.advance(1000) manager invokePrivate _updateAndSyncNumExecutorsTarget(clock.nanoTime()) - assert(numExecutorsTarget(manager) == 5) // At this point, we still have 6 executors running - assert(maxNumExecutorsNeeded(manager) == 5) + // At this point, we still have 6 executors running + assert(numExecutorsTarget(manager, defaultProfile.id) == 5) + assert(maxNumExecutorsNeededPerResourceProfile(manager, defaultProfile) == 5) // 6 speculative tasks (40 - 45) finish before the original tasks, with 4 speculative remaining (40 to 45).map { i => @@ -314,9 +541,9 @@ class ExecutorAllocationManagerSuite extends SparkFunSuite { info => post(SparkListenerTaskEnd(0, 0, null, Success, info, new ExecutorMetrics, null))} clock.advance(1000) manager invokePrivate _updateAndSyncNumExecutorsTarget(clock.nanoTime()) - assert(numExecutorsTarget(manager) === 4) - assert(maxNumExecutorsNeeded(manager) == 4) - assert(removeExecutor(manager, "10")) + assert(numExecutorsTarget(manager, defaultProfile.id) === 4) + assert(maxNumExecutorsNeededPerResourceProfile(manager, defaultProfile) == 4) + assert(removeExecutorDefaultProfile(manager, "10")) onExecutorRemoved(manager, "10") // At this point, we still have 5 executors running: ["7", "8", "9", "11", "12"] @@ -327,9 +554,9 @@ class ExecutorAllocationManagerSuite extends SparkFunSuite { SparkListenerTaskEnd(0, 0, null, TaskKilled("test"), info, new ExecutorMetrics, null))} clock.advance(1000) manager invokePrivate _updateAndSyncNumExecutorsTarget(clock.nanoTime()) - assert(numExecutorsTarget(manager) === 2) - assert(maxNumExecutorsNeeded(manager) == 2) - (7 to 8).foreach { i => assert(removeExecutor(manager, i.toString))} + assert(numExecutorsTarget(manager, defaultProfile.id) === 2) + assert(maxNumExecutorsNeededPerResourceProfile(manager, defaultProfile) == 2) + (7 to 8).foreach { i => assert(removeExecutorDefaultProfile(manager, i.toString))} (7 to 8).foreach { i => onExecutorRemoved(manager, i.toString)} // At this point, we still have 3 executors running: ["9", "11", "12"] @@ -343,8 +570,8 @@ class ExecutorAllocationManagerSuite extends 
SparkFunSuite { // tasks running. Target lowers to 2, but still hold 3 executors ["9", "11", "12"] clock.advance(1000) manager invokePrivate _updateAndSyncNumExecutorsTarget(clock.nanoTime()) - assert(numExecutorsTarget(manager) === 2) - assert(maxNumExecutorsNeeded(manager) == 2) + assert(numExecutorsTarget(manager, defaultProfile.id) === 2) + assert(maxNumExecutorsNeededPerResourceProfile(manager, defaultProfile) == 2) // At this point, we still have 3 executors running: ["9", "11", "12"] // Task 37 and 47 succeed at the same time @@ -357,9 +584,9 @@ class ExecutorAllocationManagerSuite extends SparkFunSuite { // tasks running clock.advance(1000) manager invokePrivate _updateAndSyncNumExecutorsTarget(clock.nanoTime()) - assert(numExecutorsTarget(manager) === 1) - assert(maxNumExecutorsNeeded(manager) == 1) - assert(removeExecutor(manager, "11")) + assert(numExecutorsTarget(manager, defaultProfile.id) === 1) + assert(maxNumExecutorsNeededPerResourceProfile(manager, defaultProfile) == 1) + assert(removeExecutorDefaultProfile(manager, "11")) onExecutorRemoved(manager, "11") // At this point, we still have 2 executors running: ["9", "12"] @@ -372,14 +599,14 @@ class ExecutorAllocationManagerSuite extends SparkFunSuite { clock.advance(1000) manager invokePrivate _updateAndSyncNumExecutorsTarget(clock.nanoTime()) // maxNeeded = 1, allocate one more to satisfy speculation locality requirement - assert(numExecutorsTarget(manager) === 2) - assert(maxNumExecutorsNeeded(manager) == 2) + assert(numExecutorsTarget(manager, defaultProfile.id) === 2) + assert(maxNumExecutorsNeededPerResourceProfile(manager, defaultProfile) == 2) post(SparkListenerTaskStart(0, 0, createTaskInfo(50, 39, executorId = "12", speculative = true))) clock.advance(1000) manager invokePrivate _updateAndSyncNumExecutorsTarget(clock.nanoTime()) - assert(numExecutorsTarget(manager) === 1) - assert(maxNumExecutorsNeeded(manager) == 1) + assert(numExecutorsTarget(manager, defaultProfile.id) === 1) + assert(maxNumExecutorsNeededPerResourceProfile(manager, defaultProfile) == 1) // Task 39 and 48 succeed, task 50 killed post(SparkListenerTaskEnd(0, 0, null, Success, @@ -391,11 +618,11 @@ class ExecutorAllocationManagerSuite extends SparkFunSuite { post(SparkListenerStageCompleted(stage)) clock.advance(1000) manager invokePrivate _updateAndSyncNumExecutorsTarget(clock.nanoTime()) - assert(numExecutorsTarget(manager) === 0) - assert(maxNumExecutorsNeeded(manager) == 0) - assert(removeExecutor(manager, "9")) + assert(numExecutorsTarget(manager, defaultProfile.id) === 0) + assert(maxNumExecutorsNeededPerResourceProfile(manager, defaultProfile) == 0) + assert(removeExecutorDefaultProfile(manager, "9")) onExecutorRemoved(manager, "9") - assert(removeExecutor(manager, "12")) + assert(removeExecutorDefaultProfile(manager, "12")) onExecutorRemoved(manager, "12") } @@ -417,43 +644,49 @@ class ExecutorAllocationManagerSuite extends SparkFunSuite { post(SparkListenerStageCompleted(stage)) // There are still two tasks that belong to the zombie stage running. - assert(totalRunningTasks(manager) === 2) + assert(totalRunningTasksPerResourceProfile(manager) === 2) // submit another attempt for the stage. 
We count completions from the first zombie attempt val stageAttempt1 = createStageInfo(stage.stageId, 5, attemptId = 1) post(SparkListenerStageSubmitted(stageAttempt1)) post(SparkListenerTaskEnd(0, 0, null, Success, taskInfo1, new ExecutorMetrics, null)) - assert(totalRunningTasks(manager) === 1) + assert(totalRunningTasksPerResourceProfile(manager) === 1) val attemptTaskInfo1 = createTaskInfo(3, 0, "executor-1") val attemptTaskInfo2 = createTaskInfo(4, 1, "executor-1") post(SparkListenerTaskStart(0, 1, attemptTaskInfo1)) post(SparkListenerTaskStart(0, 1, attemptTaskInfo2)) - assert(totalRunningTasks(manager) === 3) + assert(totalRunningTasksPerResourceProfile(manager) === 3) post(SparkListenerTaskEnd(0, 1, null, Success, attemptTaskInfo1, new ExecutorMetrics, null)) - assert(totalRunningTasks(manager) === 2) + assert(totalRunningTasksPerResourceProfile(manager) === 2) post(SparkListenerTaskEnd(0, 0, null, Success, taskInfo2, new ExecutorMetrics, null)) - assert(totalRunningTasks(manager) === 1) + assert(totalRunningTasksPerResourceProfile(manager) === 1) post(SparkListenerTaskEnd(0, 1, null, Success, attemptTaskInfo2, new ExecutorMetrics, null)) - assert(totalRunningTasks(manager) === 0) + assert(totalRunningTasksPerResourceProfile(manager) === 0) } testRetry("cancel pending executors when no longer needed") { val manager = createManager(createConf(0, 10, 0)) post(SparkListenerStageSubmitted(createStageInfo(2, 5))) - assert(numExecutorsTarget(manager) === 0) - assert(numExecutorsToAdd(manager) === 1) - assert(addExecutors(manager) === 1) - assert(numExecutorsTarget(manager) === 1) - assert(numExecutorsToAdd(manager) === 2) - assert(addExecutors(manager) === 2) - assert(numExecutorsTarget(manager) === 3) + val updatesNeeded = + new mutable.HashMap[ResourceProfile, ExecutorAllocationManager.TargetNumUpdates] + + assert(numExecutorsTargetForDefaultProfileId(manager) === 0) + assert(numExecutorsToAddForDefaultProfile(manager) === 1) + assert(addExecutorsToTargetForDefaultProfile(manager, updatesNeeded) === 1) + doUpdateRequest(manager, updatesNeeded.toMap, clock.getTimeMillis()) + assert(numExecutorsTargetForDefaultProfileId(manager) === 1) + assert(numExecutorsToAddForDefaultProfile(manager) === 2) + assert(addExecutorsToTargetForDefaultProfile(manager, updatesNeeded) === 2) + doUpdateRequest(manager, updatesNeeded.toMap, clock.getTimeMillis()) + assert(numExecutorsTargetForDefaultProfileId(manager) === 3) val task1Info = createTaskInfo(0, 0, "executor-1") post(SparkListenerTaskStart(2, 0, task1Info)) - assert(numExecutorsToAdd(manager) === 4) - assert(addExecutors(manager) === 2) + assert(numExecutorsToAddForDefaultProfile(manager) === 4) + assert(addExecutorsToTargetForDefaultProfile(manager, updatesNeeded) === 2) + doUpdateRequest(manager, updatesNeeded.toMap, clock.getTimeMillis()) val task2Info = createTaskInfo(1, 0, "executor-1") post(SparkListenerTaskStart(2, 0, task2Info)) @@ -469,22 +702,21 @@ class ExecutorAllocationManagerSuite extends SparkFunSuite { test("remove executors") { val manager = createManager(createConf(5, 10, 5)) - (1 to 10).map(_.toString).foreach { id => onExecutorAdded(manager, id) } + (1 to 10).map(_.toString).foreach { id => onExecutorAddedDefaultProfile(manager, id) } // Keep removing until the limit is reached assert(executorsPendingToRemove(manager).isEmpty) - assert(removeExecutor(manager, "1")) + assert(removeExecutorDefaultProfile(manager, "1")) assert(executorsPendingToRemove(manager).size === 1) assert(executorsPendingToRemove(manager).contains("1")) - 
assert(removeExecutor(manager, "2")) - assert(removeExecutor(manager, "3")) + assert(removeExecutorDefaultProfile(manager, "2")) + assert(removeExecutorDefaultProfile(manager, "3")) assert(executorsPendingToRemove(manager).size === 3) assert(executorsPendingToRemove(manager).contains("2")) assert(executorsPendingToRemove(manager).contains("3")) - assert(executorsPendingToRemove(manager).size === 3) - assert(removeExecutor(manager, "4")) - assert(removeExecutor(manager, "5")) - assert(!removeExecutor(manager, "6")) // reached the limit of 5 + assert(removeExecutorDefaultProfile(manager, "4")) + assert(removeExecutorDefaultProfile(manager, "5")) + assert(!removeExecutorDefaultProfile(manager, "6")) // reached the limit of 5 assert(executorsPendingToRemove(manager).size === 5) assert(executorsPendingToRemove(manager).contains("4")) assert(executorsPendingToRemove(manager).contains("5")) @@ -508,29 +740,29 @@ class ExecutorAllocationManagerSuite extends SparkFunSuite { // Try removing again // This should still fail because the number pending + running is still at the limit - assert(!removeExecutor(manager, "7")) + assert(!removeExecutorDefaultProfile(manager, "7")) assert(executorsPendingToRemove(manager).isEmpty) - assert(!removeExecutor(manager, "8")) + assert(!removeExecutorDefaultProfile(manager, "8")) assert(executorsPendingToRemove(manager).isEmpty) } test("remove multiple executors") { val manager = createManager(createConf(5, 10, 5)) - (1 to 10).map(_.toString).foreach { id => onExecutorAdded(manager, id) } + (1 to 10).map(_.toString).foreach { id => onExecutorAddedDefaultProfile(manager, id) } // Keep removing until the limit is reached assert(executorsPendingToRemove(manager).isEmpty) - assert(removeExecutors(manager, Seq("1")) === Seq("1")) + assert(removeExecutorsDefaultProfile(manager, Seq("1")) === Seq("1")) assert(executorsPendingToRemove(manager).size === 1) assert(executorsPendingToRemove(manager).contains("1")) - assert(removeExecutors(manager, Seq("2", "3")) === Seq("2", "3")) + assert(removeExecutorsDefaultProfile(manager, Seq("2", "3")) === Seq("2", "3")) assert(executorsPendingToRemove(manager).size === 3) assert(executorsPendingToRemove(manager).contains("2")) assert(executorsPendingToRemove(manager).contains("3")) assert(executorsPendingToRemove(manager).size === 3) - assert(removeExecutor(manager, "4")) - assert(removeExecutors(manager, Seq("5")) === Seq("5")) - assert(!removeExecutor(manager, "6")) // reached the limit of 5 + assert(removeExecutorDefaultProfile(manager, "4")) + assert(removeExecutorsDefaultProfile(manager, Seq("5")) === Seq("5")) + assert(!removeExecutorDefaultProfile(manager, "6")) // reached the limit of 5 assert(executorsPendingToRemove(manager).size === 5) assert(executorsPendingToRemove(manager).contains("4")) assert(executorsPendingToRemove(manager).contains("5")) @@ -554,87 +786,100 @@ class ExecutorAllocationManagerSuite extends SparkFunSuite { // Try removing again // This should still fail because the number pending + running is still at the limit - assert(!removeExecutor(manager, "7")) + assert(!removeExecutorDefaultProfile(manager, "7")) assert(executorsPendingToRemove(manager).isEmpty) - assert(removeExecutors(manager, Seq("8")) !== Seq("8")) + assert(removeExecutorsDefaultProfile(manager, Seq("8")) !== Seq("8")) assert(executorsPendingToRemove(manager).isEmpty) } - test ("Removing with various numExecutorsTarget condition") { + test ("Removing with various numExecutorsTargetForDefaultProfileId condition") { val manager = 
createManager(createConf(5, 12, 5)) post(SparkListenerStageSubmitted(createStageInfo(0, 8))) - // Remove when numExecutorsTarget is the same as the current number of executors - assert(addExecutors(manager) === 1) - assert(addExecutors(manager) === 2) - (1 to 8).foreach(execId => onExecutorAdded(manager, execId.toString)) + val updatesNeeded = + new mutable.HashMap[ResourceProfile, ExecutorAllocationManager.TargetNumUpdates] + + // Remove when numExecutorsTargetForDefaultProfileId is the same as the current + // number of executors + assert(addExecutorsToTargetForDefaultProfile(manager, updatesNeeded) === 1) + doUpdateRequest(manager, updatesNeeded.toMap, clock.getTimeMillis()) + assert(addExecutorsToTargetForDefaultProfile(manager, updatesNeeded) === 2) + doUpdateRequest(manager, updatesNeeded.toMap, clock.getTimeMillis()) + (1 to 8).foreach(execId => onExecutorAddedDefaultProfile(manager, execId.toString)) (1 to 8).map { i => createTaskInfo(i, i, s"$i") }.foreach { info => post(SparkListenerTaskStart(0, 0, info)) } assert(manager.executorMonitor.executorCount === 8) - assert(numExecutorsTarget(manager) === 8) - assert(maxNumExecutorsNeeded(manager) == 8) - assert(!removeExecutor(manager, "1")) // won't work since numExecutorsTarget == numExecutors + assert(numExecutorsTargetForDefaultProfileId(manager) === 8) + assert(maxNumExecutorsNeededPerResourceProfile(manager, defaultProfile) == 8) + // won't work since numExecutorsTargetForDefaultProfileId == numExecutors + assert(!removeExecutorDefaultProfile(manager, "1")) - // Remove executors when numExecutorsTarget is lower than current number of executors + // Remove executors when numExecutorsTargetForDefaultProfileId is lower than + // current number of executors (1 to 3).map { i => createTaskInfo(i, i, s"$i") }.foreach { info => post(SparkListenerTaskEnd(0, 0, null, Success, info, new ExecutorMetrics, null)) } adjustRequestedExecutors(manager) assert(manager.executorMonitor.executorCount === 8) - assert(numExecutorsTarget(manager) === 5) - assert(maxNumExecutorsNeeded(manager) == 5) - assert(removeExecutor(manager, "1")) - assert(removeExecutors(manager, Seq("2", "3"))=== Seq("2", "3")) + assert(numExecutorsTargetForDefaultProfileId(manager) === 5) + assert(maxNumExecutorsNeededPerResourceProfile(manager, defaultProfile) == 5) + assert(removeExecutorDefaultProfile(manager, "1")) + assert(removeExecutorsDefaultProfile(manager, Seq("2", "3"))=== Seq("2", "3")) onExecutorRemoved(manager, "1") onExecutorRemoved(manager, "2") onExecutorRemoved(manager, "3") - // numExecutorsTarget is lower than minNumExecutors + // numExecutorsTargetForDefaultProfileId is lower than minNumExecutors post(SparkListenerTaskEnd(0, 0, null, Success, createTaskInfo(4, 4, "4"), new ExecutorMetrics, null)) assert(manager.executorMonitor.executorCount === 5) - assert(numExecutorsTarget(manager) === 5) - assert(maxNumExecutorsNeeded(manager) == 4) - assert(!removeExecutor(manager, "4")) // lower limit - assert(addExecutors(manager) === 0) // upper limit + assert(numExecutorsTargetForDefaultProfileId(manager) === 5) + assert(maxNumExecutorsNeededPerResourceProfile(manager, defaultProfile) == 4) + assert(!removeExecutorDefaultProfile(manager, "4")) // lower limit + assert(addExecutorsToTargetForDefaultProfile(manager, updatesNeeded) === 0) // upper limit } test ("interleaving add and remove") { val manager = createManager(createConf(5, 12, 5)) post(SparkListenerStageSubmitted(createStageInfo(0, 1000))) + val updatesNeeded = + new mutable.HashMap[ResourceProfile, 
ExecutorAllocationManager.TargetNumUpdates] + // Add a few executors - assert(addExecutors(manager) === 1) - assert(addExecutors(manager) === 2) - onExecutorAdded(manager, "1") - onExecutorAdded(manager, "2") - onExecutorAdded(manager, "3") - onExecutorAdded(manager, "4") - onExecutorAdded(manager, "5") - onExecutorAdded(manager, "6") - onExecutorAdded(manager, "7") - onExecutorAdded(manager, "8") + assert(addExecutorsToTargetForDefaultProfile(manager, updatesNeeded) === 1) + doUpdateRequest(manager, updatesNeeded.toMap, clock.getTimeMillis()) + assert(addExecutorsToTargetForDefaultProfile(manager, updatesNeeded) === 2) + doUpdateRequest(manager, updatesNeeded.toMap, clock.getTimeMillis()) + onExecutorAddedDefaultProfile(manager, "1") + onExecutorAddedDefaultProfile(manager, "2") + onExecutorAddedDefaultProfile(manager, "3") + onExecutorAddedDefaultProfile(manager, "4") + onExecutorAddedDefaultProfile(manager, "5") + onExecutorAddedDefaultProfile(manager, "6") + onExecutorAddedDefaultProfile(manager, "7") + onExecutorAddedDefaultProfile(manager, "8") assert(manager.executorMonitor.executorCount === 8) - assert(numExecutorsTarget(manager) === 8) + assert(numExecutorsTargetForDefaultProfileId(manager) === 8) // Remove when numTargetExecutors is equal to the current number of executors - assert(!removeExecutor(manager, "1")) - assert(removeExecutors(manager, Seq("2", "3")) !== Seq("2", "3")) + assert(!removeExecutorDefaultProfile(manager, "1")) + assert(removeExecutorsDefaultProfile(manager, Seq("2", "3")) !== Seq("2", "3")) // Remove until limit - onExecutorAdded(manager, "9") - onExecutorAdded(manager, "10") - onExecutorAdded(manager, "11") - onExecutorAdded(manager, "12") + onExecutorAddedDefaultProfile(manager, "9") + onExecutorAddedDefaultProfile(manager, "10") + onExecutorAddedDefaultProfile(manager, "11") + onExecutorAddedDefaultProfile(manager, "12") assert(manager.executorMonitor.executorCount === 12) - assert(numExecutorsTarget(manager) === 8) + assert(numExecutorsTargetForDefaultProfileId(manager) === 8) - assert(removeExecutor(manager, "1")) - assert(removeExecutors(manager, Seq("2", "3", "4")) === Seq("2", "3", "4")) - assert(!removeExecutor(manager, "5")) // lower limit reached - assert(!removeExecutor(manager, "6")) + assert(removeExecutorDefaultProfile(manager, "1")) + assert(removeExecutorsDefaultProfile(manager, Seq("2", "3", "4")) === Seq("2", "3", "4")) + assert(!removeExecutorDefaultProfile(manager, "5")) // lower limit reached + assert(!removeExecutorDefaultProfile(manager, "6")) onExecutorRemoved(manager, "1") onExecutorRemoved(manager, "2") onExecutorRemoved(manager, "3") @@ -642,33 +887,36 @@ class ExecutorAllocationManagerSuite extends SparkFunSuite { assert(manager.executorMonitor.executorCount === 8) // Add until limit - assert(!removeExecutor(manager, "7")) // still at lower limit + assert(!removeExecutorDefaultProfile(manager, "7")) // still at lower limit assert((manager, Seq("8")) !== Seq("8")) - onExecutorAdded(manager, "13") - onExecutorAdded(manager, "14") - onExecutorAdded(manager, "15") - onExecutorAdded(manager, "16") + onExecutorAddedDefaultProfile(manager, "13") + onExecutorAddedDefaultProfile(manager, "14") + onExecutorAddedDefaultProfile(manager, "15") + onExecutorAddedDefaultProfile(manager, "16") assert(manager.executorMonitor.executorCount === 12) // Remove succeeds again, now that we are no longer at the lower limit - assert(removeExecutors(manager, Seq("5", "6", "7")) === Seq("5", "6", "7")) - assert(removeExecutor(manager, "8")) + 
assert(removeExecutorsDefaultProfile(manager, Seq("5", "6", "7")) === Seq("5", "6", "7")) + assert(removeExecutorDefaultProfile(manager, "8")) assert(manager.executorMonitor.executorCount === 12) onExecutorRemoved(manager, "5") onExecutorRemoved(manager, "6") assert(manager.executorMonitor.executorCount === 10) - assert(numExecutorsToAdd(manager) === 4) + assert(numExecutorsToAddForDefaultProfile(manager) === 4) onExecutorRemoved(manager, "9") onExecutorRemoved(manager, "10") - assert(addExecutors(manager) === 4) // at upper limit - onExecutorAdded(manager, "17") - onExecutorAdded(manager, "18") + assert(addExecutorsToTargetForDefaultProfile(manager, updatesNeeded) === 4) // at upper limit + doUpdateRequest(manager, updatesNeeded.toMap, clock.getTimeMillis()) + onExecutorAddedDefaultProfile(manager, "17") + onExecutorAddedDefaultProfile(manager, "18") assert(manager.executorMonitor.executorCount === 10) - assert(addExecutors(manager) === 0) // still at upper limit - onExecutorAdded(manager, "19") - onExecutorAdded(manager, "20") + // still at upper limit + assert(addExecutorsToTargetForDefaultProfile(manager, updatesNeeded) === 0) + doUpdateRequest(manager, updatesNeeded.toMap, clock.getTimeMillis()) + onExecutorAddedDefaultProfile(manager, "19") + onExecutorAddedDefaultProfile(manager, "20") assert(manager.executorMonitor.executorCount === 12) - assert(numExecutorsTarget(manager) === 12) + assert(numExecutorsTargetForDefaultProfileId(manager) === 12) } test("starting/canceling add timer") { @@ -706,22 +954,22 @@ class ExecutorAllocationManagerSuite extends SparkFunSuite { val manager = createManager(createConf(0, 20, 0), clock = clock) // No events - we should not be adding or removing - assert(numExecutorsTarget(manager) === 0) + assert(numExecutorsTargetForDefaultProfileId(manager) === 0) assert(executorsPendingToRemove(manager).isEmpty) schedule(manager) - assert(numExecutorsTarget(manager) === 0) + assert(numExecutorsTargetForDefaultProfileId(manager) === 0) assert(executorsPendingToRemove(manager).isEmpty) clock.advance(100L) schedule(manager) - assert(numExecutorsTarget(manager) === 0) + assert(numExecutorsTargetForDefaultProfileId(manager) === 0) assert(executorsPendingToRemove(manager).isEmpty) clock.advance(1000L) schedule(manager) - assert(numExecutorsTarget(manager) === 0) + assert(numExecutorsTargetForDefaultProfileId(manager) === 0) assert(executorsPendingToRemove(manager).isEmpty) clock.advance(10000L) schedule(manager) - assert(numExecutorsTarget(manager) === 0) + assert(numExecutorsTargetForDefaultProfileId(manager) === 0) assert(executorsPendingToRemove(manager).isEmpty) } @@ -734,43 +982,43 @@ class ExecutorAllocationManagerSuite extends SparkFunSuite { onSchedulerBacklogged(manager) clock.advance(schedulerBacklogTimeout * 1000 / 2) schedule(manager) - assert(numExecutorsTarget(manager) === 0) // timer not exceeded yet + assert(numExecutorsTargetForDefaultProfileId(manager) === 0) // timer not exceeded yet clock.advance(schedulerBacklogTimeout * 1000) schedule(manager) - assert(numExecutorsTarget(manager) === 1) // first timer exceeded + assert(numExecutorsTargetForDefaultProfileId(manager) === 1) // first timer exceeded clock.advance(sustainedSchedulerBacklogTimeout * 1000 / 2) schedule(manager) - assert(numExecutorsTarget(manager) === 1) // second timer not exceeded yet + assert(numExecutorsTargetForDefaultProfileId(manager) === 1) // second timer not exceeded yet clock.advance(sustainedSchedulerBacklogTimeout * 1000) schedule(manager) - 
assert(numExecutorsTarget(manager) === 1 + 2) // second timer exceeded + assert(numExecutorsTargetForDefaultProfileId(manager) === 1 + 2) // second timer exceeded clock.advance(sustainedSchedulerBacklogTimeout * 1000) schedule(manager) - assert(numExecutorsTarget(manager) === 1 + 2 + 4) // third timer exceeded + assert(numExecutorsTargetForDefaultProfileId(manager) === 1 + 2 + 4) // third timer exceeded // Scheduler queue drained onSchedulerQueueEmpty(manager) clock.advance(sustainedSchedulerBacklogTimeout * 1000) schedule(manager) - assert(numExecutorsTarget(manager) === 7) // timer is canceled + assert(numExecutorsTargetForDefaultProfileId(manager) === 7) // timer is canceled clock.advance(sustainedSchedulerBacklogTimeout * 1000) schedule(manager) - assert(numExecutorsTarget(manager) === 7) + assert(numExecutorsTargetForDefaultProfileId(manager) === 7) // Scheduler queue backlogged again onSchedulerBacklogged(manager) clock.advance(schedulerBacklogTimeout * 1000) schedule(manager) - assert(numExecutorsTarget(manager) === 7 + 1) // timer restarted + assert(numExecutorsTargetForDefaultProfileId(manager) === 7 + 1) // timer restarted clock.advance(sustainedSchedulerBacklogTimeout * 1000) schedule(manager) - assert(numExecutorsTarget(manager) === 7 + 1 + 2) + assert(numExecutorsTargetForDefaultProfileId(manager) === 7 + 1 + 2) clock.advance(sustainedSchedulerBacklogTimeout * 1000) schedule(manager) - assert(numExecutorsTarget(manager) === 7 + 1 + 2 + 4) + assert(numExecutorsTargetForDefaultProfileId(manager) === 7 + 1 + 2 + 4) clock.advance(sustainedSchedulerBacklogTimeout * 1000) schedule(manager) - assert(numExecutorsTarget(manager) === 20) // limit reached + assert(numExecutorsTargetForDefaultProfileId(manager) === 20) // limit reached } test("mock polling loop remove behavior") { @@ -778,9 +1026,9 @@ class ExecutorAllocationManagerSuite extends SparkFunSuite { val manager = createManager(createConf(1, 20, 1), clock = clock) // Remove idle executors on timeout - onExecutorAdded(manager, "executor-1") - onExecutorAdded(manager, "executor-2") - onExecutorAdded(manager, "executor-3") + onExecutorAddedDefaultProfile(manager, "executor-1") + onExecutorAddedDefaultProfile(manager, "executor-2") + onExecutorAddedDefaultProfile(manager, "executor-3") assert(executorsPendingToRemove(manager).isEmpty) // idle threshold not reached yet @@ -796,10 +1044,10 @@ class ExecutorAllocationManagerSuite extends SparkFunSuite { assert(executorsPendingToRemove(manager).size === 2) // limit reached (1 executor remaining) // Mark a subset as busy - only idle executors should be removed - onExecutorAdded(manager, "executor-4") - onExecutorAdded(manager, "executor-5") - onExecutorAdded(manager, "executor-6") - onExecutorAdded(manager, "executor-7") + onExecutorAddedDefaultProfile(manager, "executor-4") + onExecutorAddedDefaultProfile(manager, "executor-5") + onExecutorAddedDefaultProfile(manager, "executor-6") + onExecutorAddedDefaultProfile(manager, "executor-7") assert(manager.executorMonitor.executorCount === 7) assert(executorsPendingToRemove(manager).size === 2) // 2 pending to be removed onExecutorBusy(manager, "executor-4") @@ -864,23 +1112,31 @@ class ExecutorAllocationManagerSuite extends SparkFunSuite { val stage1 = createStageInfo(0, 1000) post(SparkListenerStageSubmitted(stage1)) - assert(addExecutors(manager) === 1) - assert(addExecutors(manager) === 2) - assert(addExecutors(manager) === 4) - assert(addExecutors(manager) === 8) - assert(numExecutorsTarget(manager) === 15) + val updatesNeeded = + new 
mutable.HashMap[ResourceProfile, ExecutorAllocationManager.TargetNumUpdates] + + assert(addExecutorsToTargetForDefaultProfile(manager, updatesNeeded) === 1) + doUpdateRequest(manager, updatesNeeded.toMap, clock.getTimeMillis()) + assert(addExecutorsToTargetForDefaultProfile(manager, updatesNeeded) === 2) + doUpdateRequest(manager, updatesNeeded.toMap, clock.getTimeMillis()) + assert(addExecutorsToTargetForDefaultProfile(manager, updatesNeeded) === 4) + doUpdateRequest(manager, updatesNeeded.toMap, clock.getTimeMillis()) + assert(addExecutorsToTargetForDefaultProfile(manager, updatesNeeded) === 8) + doUpdateRequest(manager, updatesNeeded.toMap, clock.getTimeMillis()) + assert(numExecutorsTargetForDefaultProfileId(manager) === 15) (0 until 15).foreach { i => - onExecutorAdded(manager, s"executor-$i") + onExecutorAddedDefaultProfile(manager, s"executor-$i") } assert(manager.executorMonitor.executorCount === 15) post(SparkListenerStageCompleted(stage1)) adjustRequestedExecutors(manager) - assert(numExecutorsTarget(manager) === 0) + assert(numExecutorsTargetForDefaultProfileId(manager) === 0) post(SparkListenerStageSubmitted(createStageInfo(1, 1000))) - addExecutors(manager) - assert(numExecutorsTarget(manager) === 16) + addExecutorsToTargetForDefaultProfile(manager, updatesNeeded) + doUpdateRequest(manager, updatesNeeded.toMap, clock.getTimeMillis()) + assert(numExecutorsTargetForDefaultProfileId(manager) === 16) } test("avoid ramp down initial executors until first job is submitted") { @@ -888,19 +1144,19 @@ class ExecutorAllocationManagerSuite extends SparkFunSuite { val manager = createManager(createConf(2, 5, 3), clock = clock) // Verify the initial number of executors - assert(numExecutorsTarget(manager) === 3) + assert(numExecutorsTargetForDefaultProfileId(manager) === 3) schedule(manager) // Verify whether the initial number of executors is kept with no pending tasks - assert(numExecutorsTarget(manager) === 3) + assert(numExecutorsTargetForDefaultProfileId(manager) === 3) post(SparkListenerStageSubmitted(createStageInfo(1, 2))) clock.advance(100L) - assert(maxNumExecutorsNeeded(manager) === 2) + assert(maxNumExecutorsNeededPerResourceProfile(manager, defaultProfile) === 2) schedule(manager) // Verify that current number of executors should be ramp down when first job is submitted - assert(numExecutorsTarget(manager) === 2) + assert(numExecutorsTargetForDefaultProfileId(manager) === 2) } test("avoid ramp down initial executors until idle executor is timeout") { @@ -908,20 +1164,20 @@ class ExecutorAllocationManagerSuite extends SparkFunSuite { val manager = createManager(createConf(2, 5, 3), clock = clock) // Verify the initial number of executors - assert(numExecutorsTarget(manager) === 3) + assert(numExecutorsTargetForDefaultProfileId(manager) === 3) schedule(manager) // Verify the initial number of executors is kept when no pending tasks - assert(numExecutorsTarget(manager) === 3) + assert(numExecutorsTargetForDefaultProfileId(manager) === 3) (0 until 3).foreach { i => - onExecutorAdded(manager, s"executor-$i") + onExecutorAddedDefaultProfile(manager, s"executor-$i") } clock.advance(executorIdleTimeout * 1000) - assert(maxNumExecutorsNeeded(manager) === 0) + assert(maxNumExecutorsNeededPerResourceProfile(manager, defaultProfile) === 0) schedule(manager) - // Verify executor is timeout,numExecutorsTarget is recalculated - assert(numExecutorsTarget(manager) === 2) + // Verify executor is timeout,numExecutorsTargetForDefaultProfileId is recalculated + 
assert(numExecutorsTargetForDefaultProfileId(manager) === 2) } test("get pending task number and related locality preference") { @@ -937,7 +1193,8 @@ class ExecutorAllocationManagerSuite extends SparkFunSuite { val stageInfo1 = createStageInfo(1, 5, localityPreferences1) post(SparkListenerStageSubmitted(stageInfo1)) - assert(localityAwareTasks(manager) === 3) + assert(localityAwareTasksForDefaultProfile(manager) === 3) + val hostToLocal = hostToLocalTaskCount(manager) assert(hostToLocalTaskCount(manager) === Map("host1" -> 2, "host2" -> 3, "host3" -> 2, "host4" -> 2)) @@ -949,67 +1206,76 @@ class ExecutorAllocationManagerSuite extends SparkFunSuite { val stageInfo2 = createStageInfo(2, 3, localityPreferences2) post(SparkListenerStageSubmitted(stageInfo2)) - assert(localityAwareTasks(manager) === 5) + assert(localityAwareTasksForDefaultProfile(manager) === 5) assert(hostToLocalTaskCount(manager) === Map("host1" -> 2, "host2" -> 4, "host3" -> 4, "host4" -> 3, "host5" -> 2)) post(SparkListenerStageCompleted(stageInfo1)) - assert(localityAwareTasks(manager) === 2) + assert(localityAwareTasksForDefaultProfile(manager) === 2) assert(hostToLocalTaskCount(manager) === Map("host2" -> 1, "host3" -> 2, "host4" -> 1, "host5" -> 2)) } - test("SPARK-8366: maxNumExecutorsNeeded should properly handle failed tasks") { + test("SPARK-8366: maxNumExecutorsNeededPerResourceProfile should properly handle failed tasks") { val manager = createManager(createConf()) - assert(maxNumExecutorsNeeded(manager) === 0) + assert(maxNumExecutorsNeededPerResourceProfile(manager, defaultProfile) === 0) post(SparkListenerStageSubmitted(createStageInfo(0, 1))) - assert(maxNumExecutorsNeeded(manager) === 1) + assert(maxNumExecutorsNeededPerResourceProfile(manager, defaultProfile) === 1) val taskInfo = createTaskInfo(1, 1, "executor-1") post(SparkListenerTaskStart(0, 0, taskInfo)) - assert(maxNumExecutorsNeeded(manager) === 1) + assert(maxNumExecutorsNeededPerResourceProfile(manager, defaultProfile) === 1) // If the task is failed, we expect it to be resubmitted later. val taskEndReason = ExceptionFailure(null, null, null, null, None) post(SparkListenerTaskEnd(0, 0, null, taskEndReason, taskInfo, new ExecutorMetrics, null)) - assert(maxNumExecutorsNeeded(manager) === 1) + assert(maxNumExecutorsNeededPerResourceProfile(manager, defaultProfile) === 1) } test("reset the state of allocation manager") { val manager = createManager(createConf()) - assert(numExecutorsTarget(manager) === 1) - assert(numExecutorsToAdd(manager) === 1) + assert(numExecutorsTargetForDefaultProfileId(manager) === 1) + assert(numExecutorsToAddForDefaultProfile(manager) === 1) + + val updatesNeeded = + new mutable.HashMap[ResourceProfile, ExecutorAllocationManager.TargetNumUpdates] // Allocation manager is reset when adding executor requests are sent without reporting back // executor added. 
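The pattern exercised throughout these refactored tests splits the old single addExecutors(manager) call into two steps: addExecutorsToTargetForDefaultProfile records the per-ResourceProfile delta into an updatesNeeded map, and doUpdateRequest then pushes those deltas to the cluster manager. A minimal sketch of that sequence, assuming it runs inside a test body of this suite where the private helpers and the listener-bus post() are in scope (the numbers are illustrative, not asserted by the patch itself):

    val clock = new ManualClock()
    val manager = createManager(createConf(0, 10, 0), clock = clock)
    val updatesNeeded =
      new mutable.HashMap[ResourceProfile, ExecutorAllocationManager.TargetNumUpdates]

    // A backlogged stage creates demand for executors under the default profile.
    post(SparkListenerStageSubmitted(createStageInfo(0, 10)))

    // Step 1: compute how many executors the default profile wants and record
    // the per-profile delta in updatesNeeded.
    val delta = addExecutorsToTargetForDefaultProfile(manager, updatesNeeded)
    // Step 2: send the accumulated per-profile updates to the cluster manager.
    doUpdateRequest(manager, updatesNeeded.toMap, clock.getTimeMillis())
    // After the first ramp-up step the target has grown by exactly delta.
    assert(numExecutorsTargetForDefaultProfileId(manager) === delta)
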
post(SparkListenerStageSubmitted(createStageInfo(0, 10))) - assert(addExecutors(manager) === 1) - assert(numExecutorsTarget(manager) === 2) - assert(addExecutors(manager) === 2) - assert(numExecutorsTarget(manager) === 4) - assert(addExecutors(manager) === 1) - assert(numExecutorsTarget(manager) === 5) + assert(addExecutorsToTargetForDefaultProfile(manager, updatesNeeded) === 1) + doUpdateRequest(manager, updatesNeeded.toMap, clock.getTimeMillis()) + assert(numExecutorsTargetForDefaultProfileId(manager) === 2) + assert(addExecutorsToTargetForDefaultProfile(manager, updatesNeeded) === 2) + doUpdateRequest(manager, updatesNeeded.toMap, clock.getTimeMillis()) + assert(numExecutorsTargetForDefaultProfileId(manager) === 4) + assert(addExecutorsToTargetForDefaultProfile(manager, updatesNeeded) === 1) + doUpdateRequest(manager, updatesNeeded.toMap, clock.getTimeMillis()) + assert(numExecutorsTargetForDefaultProfileId(manager) === 5) manager.reset() - assert(numExecutorsTarget(manager) === 1) - assert(numExecutorsToAdd(manager) === 1) + assert(numExecutorsTargetForDefaultProfileId(manager) === 1) + assert(numExecutorsToAddForDefaultProfile(manager) === 1) assert(manager.executorMonitor.executorCount === 0) // Allocation manager is reset when executors are added. post(SparkListenerStageSubmitted(createStageInfo(0, 10))) - addExecutors(manager) - addExecutors(manager) - addExecutors(manager) - assert(numExecutorsTarget(manager) === 5) - - onExecutorAdded(manager, "first") - onExecutorAdded(manager, "second") - onExecutorAdded(manager, "third") - onExecutorAdded(manager, "fourth") - onExecutorAdded(manager, "fifth") + addExecutorsToTargetForDefaultProfile(manager, updatesNeeded) + doUpdateRequest(manager, updatesNeeded.toMap, clock.getTimeMillis()) + addExecutorsToTargetForDefaultProfile(manager, updatesNeeded) + doUpdateRequest(manager, updatesNeeded.toMap, clock.getTimeMillis()) + addExecutorsToTargetForDefaultProfile(manager, updatesNeeded) + doUpdateRequest(manager, updatesNeeded.toMap, clock.getTimeMillis()) + assert(numExecutorsTargetForDefaultProfileId(manager) === 5) + + onExecutorAddedDefaultProfile(manager, "first") + onExecutorAddedDefaultProfile(manager, "second") + onExecutorAddedDefaultProfile(manager, "third") + onExecutorAddedDefaultProfile(manager, "fourth") + onExecutorAddedDefaultProfile(manager, "fifth") assert(manager.executorMonitor.executorCount === 5) // Cluster manager lost will make all the live executors lost, so here simulate this behavior @@ -1020,28 +1286,31 @@ class ExecutorAllocationManagerSuite extends SparkFunSuite { onExecutorRemoved(manager, "fifth") manager.reset() - assert(numExecutorsTarget(manager) === 1) - assert(numExecutorsToAdd(manager) === 1) + assert(numExecutorsTargetForDefaultProfileId(manager) === 1) + assert(numExecutorsToAddForDefaultProfile(manager) === 1) assert(manager.executorMonitor.executorCount === 0) // Allocation manager is reset when executors are pending to remove - addExecutors(manager) - addExecutors(manager) - addExecutors(manager) - assert(numExecutorsTarget(manager) === 5) - - onExecutorAdded(manager, "first") - onExecutorAdded(manager, "second") - onExecutorAdded(manager, "third") - onExecutorAdded(manager, "fourth") - onExecutorAdded(manager, "fifth") - onExecutorAdded(manager, "sixth") - onExecutorAdded(manager, "seventh") - onExecutorAdded(manager, "eighth") + addExecutorsToTargetForDefaultProfile(manager, updatesNeeded) + doUpdateRequest(manager, updatesNeeded.toMap, clock.getTimeMillis()) + 
addExecutorsToTargetForDefaultProfile(manager, updatesNeeded) + doUpdateRequest(manager, updatesNeeded.toMap, clock.getTimeMillis()) + addExecutorsToTargetForDefaultProfile(manager, updatesNeeded) + doUpdateRequest(manager, updatesNeeded.toMap, clock.getTimeMillis()) + assert(numExecutorsTargetForDefaultProfileId(manager) === 5) + + onExecutorAddedDefaultProfile(manager, "first") + onExecutorAddedDefaultProfile(manager, "second") + onExecutorAddedDefaultProfile(manager, "third") + onExecutorAddedDefaultProfile(manager, "fourth") + onExecutorAddedDefaultProfile(manager, "fifth") + onExecutorAddedDefaultProfile(manager, "sixth") + onExecutorAddedDefaultProfile(manager, "seventh") + onExecutorAddedDefaultProfile(manager, "eighth") assert(manager.executorMonitor.executorCount === 8) - removeExecutor(manager, "first") - removeExecutors(manager, Seq("second", "third")) + removeExecutorDefaultProfile(manager, "first") + removeExecutorsDefaultProfile(manager, Seq("second", "third")) assert(executorsPendingToRemove(manager) === Set("first", "second", "third")) assert(manager.executorMonitor.executorCount === 8) @@ -1055,8 +1324,8 @@ class ExecutorAllocationManagerSuite extends SparkFunSuite { manager.reset() - assert(numExecutorsTarget(manager) === 1) - assert(numExecutorsToAdd(manager) === 1) + assert(numExecutorsTargetForDefaultProfileId(manager) === 1) + assert(numExecutorsToAddForDefaultProfile(manager) === 1) assert(executorsPendingToRemove(manager) === Set.empty) assert(manager.executorMonitor.executorCount === 0) } @@ -1067,31 +1336,31 @@ class ExecutorAllocationManagerSuite extends SparkFunSuite { createConf(1, 2, 1).set(config.DYN_ALLOCATION_TESTING, false), clock = clock) - when(client.requestTotalExecutors(meq(2), any(), any())).thenReturn(true) + when(client.requestTotalExecutors(any(), any(), any())).thenReturn(true) // test setup -- job with 2 tasks, scale up to two executors - assert(numExecutorsTarget(manager) === 1) + assert(numExecutorsTargetForDefaultProfileId(manager) === 1) post(SparkListenerExecutorAdded( clock.getTimeMillis(), "executor-1", new ExecutorInfo("host1", 1, Map.empty, Map.empty))) post(SparkListenerStageSubmitted(createStageInfo(0, 2))) clock.advance(1000) manager invokePrivate _updateAndSyncNumExecutorsTarget(clock.nanoTime()) - assert(numExecutorsTarget(manager) === 2) + assert(numExecutorsTargetForDefaultProfileId(manager) === 2) val taskInfo0 = createTaskInfo(0, 0, "executor-1") post(SparkListenerTaskStart(0, 0, taskInfo0)) post(SparkListenerExecutorAdded( clock.getTimeMillis(), "executor-2", new ExecutorInfo("host1", 1, Map.empty, Map.empty))) val taskInfo1 = createTaskInfo(1, 1, "executor-2") post(SparkListenerTaskStart(0, 0, taskInfo1)) - assert(numExecutorsTarget(manager) === 2) + assert(numExecutorsTargetForDefaultProfileId(manager) === 2) // have one task finish -- we should adjust the target number of executors down // but we should *not* kill any executors yet post(SparkListenerTaskEnd(0, 0, null, Success, taskInfo0, new ExecutorMetrics, null)) - assert(maxNumExecutorsNeeded(manager) === 1) - assert(numExecutorsTarget(manager) === 2) + assert(maxNumExecutorsNeededPerResourceProfile(manager, defaultProfile) === 1) + assert(numExecutorsTargetForDefaultProfileId(manager) === 2) clock.advance(1000) manager invokePrivate _updateAndSyncNumExecutorsTarget(clock.nanoTime()) - assert(numExecutorsTarget(manager) === 1) + assert(numExecutorsTargetForDefaultProfileId(manager) === 1) verify(client, never).killExecutors(any(), any(), any(), any()) // now we cross 
the idle timeout for executor-1, so we kill it. the really important @@ -1101,8 +1370,8 @@ class ExecutorAllocationManagerSuite extends SparkFunSuite { .thenReturn(Seq("executor-1")) clock.advance(3000) schedule(manager) - assert(maxNumExecutorsNeeded(manager) === 1) - assert(numExecutorsTarget(manager) === 1) + assert(maxNumExecutorsNeededPerResourceProfile(manager, defaultProfile) === 1) + assert(numExecutorsTargetForDefaultProfileId(manager) === 1) // here's the important verify -- we did kill the executors, but did not adjust the target count verify(client).killExecutors(Seq("executor-1"), false, false, false) } @@ -1110,7 +1379,7 @@ class ExecutorAllocationManagerSuite extends SparkFunSuite { test("SPARK-26758 check executor target number after idle time out ") { val clock = new ManualClock(10000L) val manager = createManager(createConf(1, 5, 3), clock = clock) - assert(numExecutorsTarget(manager) === 3) + assert(numExecutorsTargetForDefaultProfileId(manager) === 3) post(SparkListenerExecutorAdded( clock.getTimeMillis(), "executor-1", new ExecutorInfo("host1", 1, Map.empty))) post(SparkListenerExecutorAdded( @@ -1121,14 +1390,14 @@ class ExecutorAllocationManagerSuite extends SparkFunSuite { clock.advance(executorIdleTimeout * 1000) schedule(manager) // once the schedule is run target executor number should be 1 - assert(numExecutorsTarget(manager) === 1) + assert(numExecutorsTargetForDefaultProfileId(manager) === 1) } private def createConf( minExecutors: Int = 1, maxExecutors: Int = 5, initialExecutors: Int = 1): SparkConf = { - new SparkConf() + val sparkConf = new SparkConf() .set(config.DYN_ALLOCATION_ENABLED, true) .set(config.DYN_ALLOCATION_MIN_EXECUTORS, minExecutors) .set(config.DYN_ALLOCATION_MAX_EXECUTORS, maxExecutors) @@ -1143,12 +1412,16 @@ class ExecutorAllocationManagerSuite extends SparkFunSuite { // SPARK-22864: effectively disable the allocation schedule by setting the period to a // really long value. 
.set(TEST_SCHEDULE_INTERVAL, 10000L) + sparkConf } private def createManager( conf: SparkConf, clock: Clock = new SystemClock()): ExecutorAllocationManager = { - val manager = new ExecutorAllocationManager(client, listenerBus, conf, clock = clock) + ResourceProfile.reInitDefaultProfile(conf) + rpManager = new ResourceProfileManager(conf) + val manager = new ExecutorAllocationManager(client, listenerBus, conf, clock = clock, + resourceProfileManager = rpManager) managers += manager manager.start() manager @@ -1157,7 +1430,18 @@ class ExecutorAllocationManagerSuite extends SparkFunSuite { private val execInfo = new ExecutorInfo("host1", 1, Map.empty, Map.empty, Map.empty, DEFAULT_RESOURCE_PROFILE_ID) - private def onExecutorAdded(manager: ExecutorAllocationManager, id: String): Unit = { + private def onExecutorAddedDefaultProfile( + manager: ExecutorAllocationManager, + id: String): Unit = { + post(SparkListenerExecutorAdded(0L, id, execInfo)) + } + + private def onExecutorAdded( + manager: ExecutorAllocationManager, + id: String, + rp: ResourceProfile): Unit = { + val cores = rp.getExecutorCores.getOrElse(1) + val execInfo = new ExecutorInfo("host1", cores, Map.empty, Map.empty, Map.empty, rp.id) post(SparkListenerExecutorAdded(0L, id, execInfo)) } @@ -1176,8 +1460,18 @@ class ExecutorAllocationManagerSuite extends SparkFunSuite { post(SparkListenerTaskEnd(1, 1, "foo", Success, info, new ExecutorMetrics, null)) } - private def removeExecutor(manager: ExecutorAllocationManager, executorId: String): Boolean = { - val executorsRemoved = removeExecutors(manager, Seq(executorId)) + private def removeExecutorDefaultProfile( + manager: ExecutorAllocationManager, + executorId: String): Boolean = { + val executorsRemoved = removeExecutorsDefaultProfile(manager, Seq(executorId)) + executorsRemoved.nonEmpty && executorsRemoved(0) == executorId + } + + private def removeExecutor( + manager: ExecutorAllocationManager, + executorId: String, + rpId: Int): Boolean = { + val executorsRemoved = removeExecutors(manager, Seq((executorId, rpId))) executorsRemoved.nonEmpty && executorsRemoved(0) == executorId } @@ -1199,10 +1493,11 @@ private object ExecutorAllocationManagerSuite extends PrivateMethodTester { stageId: Int, numTasks: Int, taskLocalityPreferences: Seq[Seq[TaskLocation]] = Seq.empty, - attemptId: Int = 0 + attemptId: Int = 0, + rp: ResourceProfile = defaultProfile ): StageInfo = { new StageInfo(stageId, attemptId, "name", numTasks, Seq.empty, Seq.empty, "no details", - taskLocalityPreferences = taskLocalityPreferences) + taskLocalityPreferences = taskLocalityPreferences, resourceProfileId = rp.id) } private def createTaskInfo( @@ -1217,54 +1512,117 @@ private object ExecutorAllocationManagerSuite extends PrivateMethodTester { | Helper methods for accessing private methods and fields | * ------------------------------------------------------- */ - private val _numExecutorsToAdd = PrivateMethod[Int](Symbol("numExecutorsToAdd")) - private val _numExecutorsTarget = PrivateMethod[Int](Symbol("numExecutorsTarget")) - private val _maxNumExecutorsNeeded = PrivateMethod[Int](Symbol("maxNumExecutorsNeeded")) + private val _numExecutorsToAddPerResourceProfileId = + PrivateMethod[mutable.HashMap[Int, Int]]( + Symbol("numExecutorsToAddPerResourceProfileId")) + private val _numExecutorsTargetPerResourceProfileId = + PrivateMethod[mutable.HashMap[Int, Int]]( + Symbol("numExecutorsTargetPerResourceProfileId")) + private val _maxNumExecutorsNeededPerResourceProfile = + 
PrivateMethod[Int](Symbol("maxNumExecutorsNeededPerResourceProfile")) private val _addTime = PrivateMethod[Long](Symbol("addTime")) private val _schedule = PrivateMethod[Unit](Symbol("schedule")) - private val _addExecutors = PrivateMethod[Int](Symbol("addExecutors")) + private val _doUpdateRequest = PrivateMethod[Unit](Symbol("doUpdateRequest")) private val _updateAndSyncNumExecutorsTarget = PrivateMethod[Int](Symbol("updateAndSyncNumExecutorsTarget")) + private val _addExecutorsToTarget = PrivateMethod[Int](Symbol("addExecutorsToTarget")) private val _removeExecutors = PrivateMethod[Seq[String]](Symbol("removeExecutors")) private val _onSchedulerBacklogged = PrivateMethod[Unit](Symbol("onSchedulerBacklogged")) private val _onSchedulerQueueEmpty = PrivateMethod[Unit](Symbol("onSchedulerQueueEmpty")) - private val _localityAwareTasks = PrivateMethod[Int](Symbol("localityAwareTasks")) - private val _hostToLocalTaskCount = - PrivateMethod[Map[String, Int]](Symbol("hostToLocalTaskCount")) + private val _localityAwareTasksPerResourceProfileId = + PrivateMethod[mutable.HashMap[Int, Int]](Symbol("numLocalityAwareTasksPerResourceProfileId")) + private val _rpIdToHostToLocalTaskCount = + PrivateMethod[Map[Int, Map[String, Int]]](Symbol("rpIdToHostToLocalTaskCount")) private val _onSpeculativeTaskSubmitted = PrivateMethod[Unit](Symbol("onSpeculativeTaskSubmitted")) - private val _totalRunningTasks = PrivateMethod[Int](Symbol("totalRunningTasks")) + private val _totalRunningTasksPerResourceProfile = + PrivateMethod[Int](Symbol("totalRunningTasksPerResourceProfile")) + + private val defaultProfile = ResourceProfile.getOrCreateDefaultProfile(new SparkConf) + + private def numExecutorsToAddForDefaultProfile(manager: ExecutorAllocationManager): Int = { + numExecutorsToAdd(manager, defaultProfile) + } + + private def numExecutorsToAdd( + manager: ExecutorAllocationManager, + rp: ResourceProfile): Int = { + val nmap = manager invokePrivate _numExecutorsToAddPerResourceProfileId() + nmap(rp.id) + } + + private def updateAndSyncNumExecutorsTarget( + manager: ExecutorAllocationManager, + now: Long): Unit = { + manager invokePrivate _updateAndSyncNumExecutorsTarget(now) + } + + private def numExecutorsTargetForDefaultProfileId(manager: ExecutorAllocationManager): Int = { + numExecutorsTarget(manager, defaultProfile.id) + } - private def numExecutorsToAdd(manager: ExecutorAllocationManager): Int = { - manager invokePrivate _numExecutorsToAdd() + private def numExecutorsTarget( + manager: ExecutorAllocationManager, + rpId: Int): Int = { + val numMap = manager invokePrivate _numExecutorsTargetPerResourceProfileId() + numMap(rpId) } - private def numExecutorsTarget(manager: ExecutorAllocationManager): Int = { - manager invokePrivate _numExecutorsTarget() + private def addExecutorsToTargetForDefaultProfile( + manager: ExecutorAllocationManager, + updatesNeeded: mutable.HashMap[ResourceProfile, + ExecutorAllocationManager.TargetNumUpdates] + ): Int = { + addExecutorsToTarget(manager, updatesNeeded, defaultProfile) + } + + private def addExecutorsToTarget( + manager: ExecutorAllocationManager, + updatesNeeded: mutable.HashMap[ResourceProfile, + ExecutorAllocationManager.TargetNumUpdates], + rp: ResourceProfile + ): Int = { + val maxNumExecutorsNeeded = + manager invokePrivate _maxNumExecutorsNeededPerResourceProfile(rp.id) + manager invokePrivate + _addExecutorsToTarget(maxNumExecutorsNeeded, rp.id, updatesNeeded) } private def addTime(manager: ExecutorAllocationManager): Long = { manager invokePrivate _addTime() } 
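Most tests above go through the *ForDefaultProfile shortcuts, but each shortcut delegates to an rp-aware variant that takes an explicit ResourceProfile or profile id. A hypothetical sketch of driving those variants directly from a test body, using defaultProfile as a stand-in for any profile and an arbitrary executor id ("executor-a"); manager and clock are assumed to be created as in the other tests:

    val clock = new ManualClock()
    val manager = createManager(createConf(0, 10, 0), clock = clock)
    val updatesNeeded =
      new mutable.HashMap[ResourceProfile, ExecutorAllocationManager.TargetNumUpdates]

    // Submit a stage tagged with the profile under test (here the default one).
    post(SparkListenerStageSubmitted(createStageInfo(0, 4, rp = defaultProfile)))

    // The rp-aware calls mirror the default-profile shortcuts used above.
    addExecutorsToTarget(manager, updatesNeeded, defaultProfile)
    doUpdateRequest(manager, updatesNeeded.toMap, clock.getTimeMillis())
    onExecutorAdded(manager, "executor-a", defaultProfile)

    // For the default profile both accessors resolve to the same per-profile entry.
    assert(numExecutorsTarget(manager, defaultProfile.id) ===
      numExecutorsTargetForDefaultProfileId(manager))
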
- private def schedule(manager: ExecutorAllocationManager): Unit = { - manager invokePrivate _schedule() + private def doUpdateRequest( + manager: ExecutorAllocationManager, + updates: Map[ResourceProfile, ExecutorAllocationManager.TargetNumUpdates], + now: Long): Unit = { + manager invokePrivate _doUpdateRequest(updates, now) } - private def maxNumExecutorsNeeded(manager: ExecutorAllocationManager): Int = { - manager invokePrivate _maxNumExecutorsNeeded() + private def schedule(manager: ExecutorAllocationManager): Unit = { + manager invokePrivate _schedule() } - private def addExecutors(manager: ExecutorAllocationManager): Int = { - val maxNumExecutorsNeeded = manager invokePrivate _maxNumExecutorsNeeded() - manager invokePrivate _addExecutors(maxNumExecutorsNeeded) + private def maxNumExecutorsNeededPerResourceProfile( + manager: ExecutorAllocationManager, + rp: ResourceProfile): Int = { + manager invokePrivate _maxNumExecutorsNeededPerResourceProfile(rp.id) } private def adjustRequestedExecutors(manager: ExecutorAllocationManager): Int = { manager invokePrivate _updateAndSyncNumExecutorsTarget(0L) } - private def removeExecutors(manager: ExecutorAllocationManager, ids: Seq[String]): Seq[String] = { + private def removeExecutorsDefaultProfile( + manager: ExecutorAllocationManager, + ids: Seq[String]): Seq[String] = { + val idsAndProfileIds = ids.map((_, defaultProfile.id)) + manager invokePrivate _removeExecutors(idsAndProfileIds) + } + + private def removeExecutors( + manager: ExecutorAllocationManager, + ids: Seq[(String, Int)]): Seq[String] = { manager invokePrivate _removeExecutors(ids) } @@ -1280,15 +1638,22 @@ private object ExecutorAllocationManagerSuite extends PrivateMethodTester { manager invokePrivate _onSpeculativeTaskSubmitted(id) } - private def localityAwareTasks(manager: ExecutorAllocationManager): Int = { - manager invokePrivate _localityAwareTasks() + private def localityAwareTasksForDefaultProfile(manager: ExecutorAllocationManager): Int = { + val localMap = manager invokePrivate _localityAwareTasksPerResourceProfileId() + localMap(defaultProfile.id) + } + + private def totalRunningTasksPerResourceProfile(manager: ExecutorAllocationManager): Int = { + manager invokePrivate _totalRunningTasksPerResourceProfile(defaultProfile.id) } - private def totalRunningTasks(manager: ExecutorAllocationManager): Int = { - manager invokePrivate _totalRunningTasks() + private def hostToLocalTaskCount( + manager: ExecutorAllocationManager): Map[String, Int] = { + val rpIdToHostLocal = manager invokePrivate _rpIdToHostToLocalTaskCount() + rpIdToHostLocal(defaultProfile.id) } - private def hostToLocalTaskCount(manager: ExecutorAllocationManager): Map[String, Int] = { - manager invokePrivate _hostToLocalTaskCount() + private def getResourceProfileIdOfExecutor(manager: ExecutorAllocationManager): Int = { + defaultProfile.id } } diff --git a/core/src/test/scala/org/apache/spark/HeartbeatReceiverSuite.scala b/core/src/test/scala/org/apache/spark/HeartbeatReceiverSuite.scala index ff0f2f9134ed3..a9296955d18b4 100644 --- a/core/src/test/scala/org/apache/spark/HeartbeatReceiverSuite.scala +++ b/core/src/test/scala/org/apache/spark/HeartbeatReceiverSuite.scala @@ -285,9 +285,14 @@ private class FakeSchedulerBackend( clusterManagerEndpoint: RpcEndpointRef) extends CoarseGrainedSchedulerBackend(scheduler, rpcEnv) { - protected override def doRequestTotalExecutors(requestedTotal: Int): Future[Boolean] = { + protected override def doRequestTotalExecutors( + resourceProfileToTotalExecs: 
Map[ResourceProfile, Int]): Future[Boolean] = { clusterManagerEndpoint.ask[Boolean]( - RequestExecutors(requestedTotal, localityAwareTasks, hostToLocalTaskCount, Set.empty)) + RequestExecutors( + resourceProfileToTotalExecs(ResourceProfile.getOrCreateDefaultProfile(conf)), + numLocalityAwareTasksPerResourceProfileId(ResourceProfile.DEFAULT_RESOURCE_PROFILE_ID), + rpHostToLocalTaskCount(ResourceProfile.DEFAULT_RESOURCE_PROFILE_ID), + Set.empty)) } protected override def doKillExecutors(executorIds: Seq[String]): Future[Boolean] = { diff --git a/core/src/test/scala/org/apache/spark/LocalSparkContext.scala b/core/src/test/scala/org/apache/spark/LocalSparkContext.scala index 1fe12e116d96e..599ea8955491f 100644 --- a/core/src/test/scala/org/apache/spark/LocalSparkContext.scala +++ b/core/src/test/scala/org/apache/spark/LocalSparkContext.scala @@ -44,7 +44,7 @@ trait LocalSparkContext extends BeforeAndAfterEach with BeforeAndAfterAll { self def resetSparkContext(): Unit = { LocalSparkContext.stop(sc) - ResourceProfile.clearDefaultProfile + ResourceProfile.clearDefaultProfile() sc = null } diff --git a/core/src/test/scala/org/apache/spark/SparkContextSuite.scala b/core/src/test/scala/org/apache/spark/SparkContextSuite.scala index df9c7c5eaa368..b6dfa69015c28 100644 --- a/core/src/test/scala/org/apache/spark/SparkContextSuite.scala +++ b/core/src/test/scala/org/apache/spark/SparkContextSuite.scala @@ -36,6 +36,7 @@ import org.scalatest.concurrent.Eventually import org.apache.spark.TestUtils._ import org.apache.spark.internal.config._ +import org.apache.spark.internal.config.Tests._ import org.apache.spark.internal.config.UI._ import org.apache.spark.resource.ResourceAllocation import org.apache.spark.resource.ResourceUtils._ @@ -784,7 +785,7 @@ class SparkContextSuite extends SparkFunSuite with LocalSparkContext with Eventu } test(s"Avoid setting ${CPUS_PER_TASK.key} unreasonably (SPARK-27192)") { - val FAIL_REASON = s"has to be >= the task config: ${CPUS_PER_TASK.key}" + val FAIL_REASON = " has to be >= the number of cpus per task" Seq( ("local", 2, None), ("local[2]", 3, None), @@ -864,9 +865,8 @@ class SparkContextSuite extends SparkFunSuite with LocalSparkContext with Eventu sc = new SparkContext(conf) }.getMessage() - assert(error.contains("The executor resource config: spark.executor.resource.gpu.amount " + - "needs to be specified since a task requirement config: spark.task.resource.gpu.amount " + - "was specified")) + assert(error.contains("No executor resource configs were not specified for the following " + + "task configs: gpu")) } test("Test parsing resources executor config < task requirements") { @@ -880,15 +880,15 @@ class SparkContextSuite extends SparkFunSuite with LocalSparkContext with Eventu sc = new SparkContext(conf) }.getMessage() - assert(error.contains("The executor resource config: spark.executor.resource.gpu.amount = 1 " + - "has to be >= the requested amount in task resource config: " + - "spark.task.resource.gpu.amount = 2")) + assert(error.contains("The executor resource: gpu, amount: 1 needs to be >= the task " + + "resource request amount of 2.0")) } test("Parse resources executor config not the same multiple numbers of the task requirements") { val conf = new SparkConf() .setMaster("local-cluster[1, 1, 1024]") .setAppName("test-cluster") + conf.set(RESOURCES_WARNING_TESTING, true) conf.set(TASK_GPU_ID.amountConf, "2") conf.set(EXECUTOR_GPU_ID.amountConf, "4") @@ -897,25 +897,9 @@ class SparkContextSuite extends SparkFunSuite with LocalSparkContext with Eventu 
}.getMessage() assert(error.contains( - "The configuration of resource: gpu (exec = 4, task = 2, runnable tasks = 2) will result " + - "in wasted resources due to resource CPU limiting the number of runnable tasks per " + - "executor to: 1. Please adjust your configuration.")) - } - - test("Parse resources executor config cpus not limiting resource") { - val conf = new SparkConf() - .setMaster("local-cluster[1, 8, 1024]") - .setAppName("test-cluster") - conf.set(TASK_GPU_ID.amountConf, "2") - conf.set(EXECUTOR_GPU_ID.amountConf, "4") - - var error = intercept[IllegalArgumentException] { - sc = new SparkContext(conf) - }.getMessage() - - assert(error.contains("The number of slots on an executor has to be " + - "limited by the number of cores, otherwise you waste resources and " + - "dynamic allocation doesn't work properly")) + "The configuration of resource: gpu (exec = 4, task = 2.0/1, runnable tasks = 2) will " + + "result in wasted resources due to resource cpus limiting the number of runnable " + + "tasks per executor to: 1. Please adjust your configuration.")) } test("test resource scheduling under local-cluster mode") { diff --git a/core/src/test/scala/org/apache/spark/deploy/history/BasicEventFilterBuilderSuite.scala b/core/src/test/scala/org/apache/spark/deploy/history/BasicEventFilterBuilderSuite.scala index 86511ae08784a..c905797bf1287 100644 --- a/core/src/test/scala/org/apache/spark/deploy/history/BasicEventFilterBuilderSuite.scala +++ b/core/src/test/scala/org/apache/spark/deploy/history/BasicEventFilterBuilderSuite.scala @@ -19,6 +19,7 @@ package org.apache.spark.deploy.history import org.apache.spark.{SparkFunSuite, Success, TaskResultLost, TaskState} import org.apache.spark.executor.{ExecutorMetrics, TaskMetrics} +import org.apache.spark.resource.ResourceProfile import org.apache.spark.scheduler._ import org.apache.spark.status.ListenerEventsTestHelper @@ -141,7 +142,8 @@ class BasicEventFilterBuilderSuite extends SparkFunSuite { // - Re-submit stage 1, all tasks, and succeed them and the stage. val oldS1 = stages.last val newS1 = new StageInfo(oldS1.stageId, oldS1.attemptNumber + 1, oldS1.name, oldS1.numTasks, - oldS1.rddInfos, oldS1.parentIds, oldS1.details, oldS1.taskMetrics) + oldS1.rddInfos, oldS1.parentIds, oldS1.details, oldS1.taskMetrics, + resourceProfileId = ResourceProfile.DEFAULT_RESOURCE_PROFILE_ID) time += 1 newS1.submissionTime = Some(time) diff --git a/core/src/test/scala/org/apache/spark/resource/ResourceProfileManagerSuite.scala b/core/src/test/scala/org/apache/spark/resource/ResourceProfileManagerSuite.scala new file mode 100644 index 0000000000000..075260317284d --- /dev/null +++ b/core/src/test/scala/org/apache/spark/resource/ResourceProfileManagerSuite.scala @@ -0,0 +1,103 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.resource + +import org.apache.spark.{SparkConf, SparkException, SparkFunSuite} +import org.apache.spark.internal.config._ +import org.apache.spark.internal.config.Tests._ + +class ResourceProfileManagerSuite extends SparkFunSuite { + + override def beforeAll() { + try { + ResourceProfile.clearDefaultProfile() + } finally { + super.beforeAll() + } + } + + override def afterEach() { + try { + ResourceProfile.clearDefaultProfile() + } finally { + super.afterEach() + } + } + + test("ResourceProfileManager") { + val conf = new SparkConf().set(EXECUTOR_CORES, 4) + val rpmanager = new ResourceProfileManager(conf) + val defaultProf = rpmanager.defaultResourceProfile + assert(defaultProf.id === ResourceProfile.DEFAULT_RESOURCE_PROFILE_ID) + assert(defaultProf.executorResources.size === 2, + "Executor resources should contain cores and memory by default") + assert(defaultProf.executorResources(ResourceProfile.CORES).amount === 4, + s"Executor resources should have 4 cores") + } + + test("isSupported yarn no dynamic allocation") { + val conf = new SparkConf().setMaster("yarn").set(EXECUTOR_CORES, 4) + conf.set(RESOURCE_PROFILE_MANAGER_TESTING.key, "true") + val rpmanager = new ResourceProfileManager(conf) + // default profile should always work + val defaultProf = rpmanager.defaultResourceProfile + val rprof = new ResourceProfileBuilder() + val gpuExecReq = + new ExecutorResourceRequests().resource("gpu", 2, "someScript") + val immrprof = rprof.require(gpuExecReq).build + val error = intercept[SparkException] { + rpmanager.isSupported(immrprof) + }.getMessage() + + assert(error.contains("ResourceProfiles are only supported on YARN with dynamic allocation")) + } + + test("isSupported yarn with dynamic allocation") { + val conf = new SparkConf().setMaster("yarn").set(EXECUTOR_CORES, 4) + conf.set(DYN_ALLOCATION_ENABLED, true) + conf.set(RESOURCE_PROFILE_MANAGER_TESTING.key, "true") + val rpmanager = new ResourceProfileManager(conf) + // default profile should always work + val defaultProf = rpmanager.defaultResourceProfile + val rprof = new ResourceProfileBuilder() + val gpuExecReq = + new ExecutorResourceRequests().resource("gpu", 2, "someScript") + val immrprof = rprof.require(gpuExecReq).build + assert(rpmanager.isSupported(immrprof) == true) + } + + test("isSupported yarn with local mode") { + val conf = new SparkConf().setMaster("local").set(EXECUTOR_CORES, 4) + conf.set(RESOURCE_PROFILE_MANAGER_TESTING.key, "true") + val rpmanager = new ResourceProfileManager(conf) + // default profile should always work + val defaultProf = rpmanager.defaultResourceProfile + val rprof = new ResourceProfileBuilder() + val gpuExecReq = + new ExecutorResourceRequests().resource("gpu", 2, "someScript") + val immrprof = rprof.require(gpuExecReq).build + var error = intercept[SparkException] { + rpmanager.isSupported(immrprof) + }.getMessage() + + assert(error.contains("ResourceProfiles are only supported on YARN with dynamic allocation")) + } + + + +} diff --git a/core/src/test/scala/org/apache/spark/resource/ResourceProfileSuite.scala b/core/src/test/scala/org/apache/spark/resource/ResourceProfileSuite.scala index c0637eeeacaba..b2f2c3632e454 100644 --- a/core/src/test/scala/org/apache/spark/resource/ResourceProfileSuite.scala +++ b/core/src/test/scala/org/apache/spark/resource/ResourceProfileSuite.scala @@ -18,18 +18,28 @@ package org.apache.spark.resource import 
org.apache.spark.{SparkConf, SparkFunSuite} -import org.apache.spark.internal.config.{EXECUTOR_CORES, EXECUTOR_MEMORY, EXECUTOR_MEMORY_OVERHEAD, SPARK_EXECUTOR_PREFIX} +import org.apache.spark.internal.config.{EXECUTOR_CORES, EXECUTOR_MEMORY, EXECUTOR_MEMORY_OVERHEAD} import org.apache.spark.internal.config.Python.PYSPARK_EXECUTOR_MEMORY +import org.apache.spark.resource.TestResourceIDs._ class ResourceProfileSuite extends SparkFunSuite { + override def beforeAll() { + try { + ResourceProfile.clearDefaultProfile() + } finally { + super.beforeAll() + } + } + override def afterEach() { try { - ResourceProfile.clearDefaultProfile + ResourceProfile.clearDefaultProfile() } finally { super.afterEach() } } + test("Default ResourceProfile") { val rprof = ResourceProfile.getOrCreateDefaultProfile(new SparkConf) assert(rprof.id === ResourceProfile.DEFAULT_RESOURCE_PROFILE_ID) @@ -59,18 +69,19 @@ class ResourceProfileSuite extends SparkFunSuite { conf.set(EXECUTOR_MEMORY_OVERHEAD.key, "1g") conf.set(EXECUTOR_MEMORY.key, "4g") conf.set(EXECUTOR_CORES.key, "4") - conf.set("spark.task.resource.gpu.amount", "1") - conf.set(s"$SPARK_EXECUTOR_PREFIX.resource.gpu.amount", "1") - conf.set(s"$SPARK_EXECUTOR_PREFIX.resource.gpu.discoveryScript", "nameOfScript") + conf.set(TASK_GPU_ID.amountConf, "1") + conf.set(EXECUTOR_GPU_ID.amountConf, "1") + conf.set(EXECUTOR_GPU_ID.discoveryScriptConf, "nameOfScript") val rprof = ResourceProfile.getOrCreateDefaultProfile(conf) assert(rprof.id === ResourceProfile.DEFAULT_RESOURCE_PROFILE_ID) val execResources = rprof.executorResources - assert(execResources.size === 5, - "Executor resources should contain cores, memory, and gpu " + execResources) + assert(execResources.size === 5, s"Executor resources should contain cores, pyspark " + + s"memory, memory overhead, memory, and gpu $execResources") assert(execResources.contains("gpu"), "Executor resources should have gpu") assert(rprof.executorResources(ResourceProfile.CORES).amount === 4, "Executor resources should have 4 core") - assert(rprof.getExecutorCores.get === 4, "Executor resources should have 4 core") + assert(rprof.getExecutorCores.get === 4, + "Executor resources should have 4 core") assert(rprof.executorResources(ResourceProfile.MEMORY).amount === 4096, "Executor resources should have 1024 memory") assert(rprof.executorResources(ResourceProfile.PYSPARK_MEM).amount == 2048, @@ -84,12 +95,60 @@ class ResourceProfileSuite extends SparkFunSuite { test("test default profile task gpus fractional") { val sparkConf = new SparkConf() - .set("spark.executor.resource.gpu.amount", "2") - .set("spark.task.resource.gpu.amount", "0.33") + .set(EXECUTOR_GPU_ID.amountConf, "2") + .set(TASK_GPU_ID.amountConf, "0.33") val immrprof = ResourceProfile.getOrCreateDefaultProfile(sparkConf) assert(immrprof.taskResources.get("gpu").get.amount == 0.33) } + test("maxTasksPerExecutor cpus") { + val sparkConf = new SparkConf() + .set(EXECUTOR_CORES, 1) + val rprof = new ResourceProfileBuilder() + val taskReq = new TaskResourceRequests().resource("gpu", 1) + val execReq = + new ExecutorResourceRequests().resource("gpu", 2, "myscript", "nvidia") + rprof.require(taskReq).require(execReq) + val immrprof = new ResourceProfile(rprof.executorResources, rprof.taskResources) + assert(immrprof.limitingResource(sparkConf) == "cpus") + assert(immrprof.maxTasksPerExecutor(sparkConf) == 1) + } + + test("maxTasksPerExecutor/limiting no executor cores") { + val sparkConf = new SparkConf().setMaster("spark://testing") + val rprof = new 
ResourceProfileBuilder() + val taskReq = new TaskResourceRequests().resource("gpu", 1) + val execReq = + new ExecutorResourceRequests().resource("gpu", 2, "myscript", "nvidia") + rprof.require(taskReq).require(execReq) + val immrprof = new ResourceProfile(rprof.executorResources, rprof.taskResources) + assert(immrprof.limitingResource(sparkConf) == "gpu") + assert(immrprof.maxTasksPerExecutor(sparkConf) == 2) + assert(immrprof.isCoresLimitKnown == false) + } + + test("maxTasksPerExecutor/limiting no other resource no executor cores") { + val sparkConf = new SparkConf().setMaster("spark://testing") + val immrprof = ResourceProfile.getOrCreateDefaultProfile(sparkConf) + assert(immrprof.limitingResource(sparkConf) == "") + assert(immrprof.maxTasksPerExecutor(sparkConf) == 1) + assert(immrprof.isCoresLimitKnown == false) + } + + test("maxTasksPerExecutor/limiting executor cores") { + val sparkConf = new SparkConf().setMaster("spark://testing").set(EXECUTOR_CORES, 2) + val rprof = new ResourceProfileBuilder() + val taskReq = new TaskResourceRequests().resource("gpu", 1) + val execReq = + new ExecutorResourceRequests().resource("gpu", 2, "myscript", "nvidia") + rprof.require(taskReq).require(execReq) + val immrprof = new ResourceProfile(rprof.executorResources, rprof.taskResources) + assert(immrprof.limitingResource(sparkConf) == ResourceProfile.CPUS) + assert(immrprof.maxTasksPerExecutor(sparkConf) == 2) + assert(immrprof.isCoresLimitKnown == true) + } + + test("Create ResourceProfile") { val rprof = new ResourceProfileBuilder() val taskReq = new TaskResourceRequests().resource("gpu", 1) diff --git a/core/src/test/scala/org/apache/spark/resource/ResourceUtilsSuite.scala b/core/src/test/scala/org/apache/spark/resource/ResourceUtilsSuite.scala index dffe9a02e9aa4..278a72a7192d8 100644 --- a/core/src/test/scala/org/apache/spark/resource/ResourceUtilsSuite.scala +++ b/core/src/test/scala/org/apache/spark/resource/ResourceUtilsSuite.scala @@ -26,8 +26,10 @@ import org.json4s.{DefaultFormats, Extraction} import org.apache.spark.{LocalSparkContext, SparkConf, SparkException, SparkFunSuite} import org.apache.spark.TestUtils._ import org.apache.spark.internal.config._ +import org.apache.spark.internal.config.Tests._ import org.apache.spark.resource.ResourceUtils._ import org.apache.spark.resource.TestResourceIDs._ +import org.apache.spark.scheduler.LiveListenerBus import org.apache.spark.util.Utils class ResourceUtilsSuite extends SparkFunSuite @@ -165,6 +167,7 @@ class ResourceUtilsSuite extends SparkFunSuite val rpBuilder = new ResourceProfileBuilder() val ereqs = new ExecutorResourceRequests().resource(GPU, 2, gpuDiscovery) val treqs = new TaskResourceRequests().resource(GPU, 1) + val rp = rpBuilder.require(ereqs).require(treqs).build val resourcesFromBoth = getOrDiscoverAllResourcesForResourceProfile( Some(resourcesFile), SPARK_EXECUTOR_PREFIX, rp, conf) diff --git a/core/src/test/scala/org/apache/spark/scheduler/CoarseGrainedSchedulerBackendSuite.scala b/core/src/test/scala/org/apache/spark/scheduler/CoarseGrainedSchedulerBackendSuite.scala index c063301673598..7666c6c7810cc 100644 --- a/core/src/test/scala/org/apache/spark/scheduler/CoarseGrainedSchedulerBackendSuite.scala +++ b/core/src/test/scala/org/apache/spark/scheduler/CoarseGrainedSchedulerBackendSuite.scala @@ -34,7 +34,7 @@ import org.apache.spark._ import org.apache.spark.internal.config._ import org.apache.spark.internal.config.Network.RPC_MESSAGE_MAX_SIZE import org.apache.spark.rdd.RDD -import 
org.apache.spark.resource.{ResourceInformation, ResourceProfile} +import org.apache.spark.resource.{ExecutorResourceRequests, ResourceInformation, ResourceProfile, TaskResourceRequests} import org.apache.spark.resource.ResourceUtils._ import org.apache.spark.resource.TestResourceIDs._ import org.apache.spark.rpc.{RpcAddress, RpcEndpointRef, RpcEnv} @@ -187,8 +187,6 @@ class CoarseGrainedSchedulerBackendSuite extends SparkFunSuite with LocalSparkCo } test("extra resources from executor") { - import TestUtils._ - val conf = new SparkConf() .set(EXECUTOR_CORES, 1) .set(SCHEDULER_REVIVE_INTERVAL.key, "1m") // don't let it auto revive during test @@ -200,6 +198,11 @@ class CoarseGrainedSchedulerBackendSuite extends SparkFunSuite with LocalSparkCo conf.set(EXECUTOR_GPU_ID.amountConf, "1") sc = new SparkContext(conf) + val execGpu = new ExecutorResourceRequests().cores(1).resource(GPU, 3) + val taskGpu = new TaskResourceRequests().cpus(1).resource(GPU, 1) + val rp = new ResourceProfile(execGpu.requests, taskGpu.requests) + sc.resourceProfileManager.addResourceProfile(rp) + assert(rp.id > ResourceProfile.DEFAULT_RESOURCE_PROFILE_ID) val backend = sc.schedulerBackend.asInstanceOf[TestCoarseGrainedSchedulerBackend] val mockEndpointRef = mock[RpcEndpointRef] val mockAddress = mock[RpcAddress] @@ -224,7 +227,7 @@ class CoarseGrainedSchedulerBackendSuite extends SparkFunSuite with LocalSparkCo ResourceProfile.DEFAULT_RESOURCE_PROFILE_ID)) backend.driverEndpoint.askSync[Boolean]( RegisterExecutor("3", mockEndpointRef, mockAddress.host, 1, Map.empty, Map.empty, resources, - 5)) + rp.id)) val frameSize = RpcUtils.maxMessageSizeBytes(sc.conf) val bytebuffer = java.nio.ByteBuffer.allocate(frameSize - 100) @@ -234,7 +237,7 @@ class CoarseGrainedSchedulerBackendSuite extends SparkFunSuite with LocalSparkCo assert(execResources(GPU).availableAddrs.sorted === Array("0", "1", "3")) var exec3ResourceProfileId = backend.getExecutorResourceProfileId("3") - assert(exec3ResourceProfileId === 5) + assert(exec3ResourceProfileId === rp.id) val taskResources = Map(GPU -> new ResourceInformation(GPU, Array("0"))) var taskDescs: Seq[Seq[TaskDescription]] = Seq(Seq(new TaskDescription(1, 0, "1", diff --git a/core/src/test/scala/org/apache/spark/scheduler/EventLoggingListenerSuite.scala b/core/src/test/scala/org/apache/spark/scheduler/EventLoggingListenerSuite.scala index 286924001e920..61ea21fa86c5a 100644 --- a/core/src/test/scala/org/apache/spark/scheduler/EventLoggingListenerSuite.scala +++ b/core/src/test/scala/org/apache/spark/scheduler/EventLoggingListenerSuite.scala @@ -38,6 +38,7 @@ import org.apache.spark.executor.{ExecutorMetrics, TaskMetrics} import org.apache.spark.internal.Logging import org.apache.spark.io._ import org.apache.spark.metrics.{ExecutorMetricType, MetricsSystem} +import org.apache.spark.resource.ResourceProfile import org.apache.spark.scheduler.cluster.ExecutorInfo import org.apache.spark.util.{JsonProtocol, Utils} @@ -438,12 +439,14 @@ class EventLoggingListenerSuite extends SparkFunSuite with LocalSparkContext wit private def createStageSubmittedEvent(stageId: Int) = { SparkListenerStageSubmitted(new StageInfo(stageId, 0, stageId.toString, 0, - Seq.empty, Seq.empty, "details")) + Seq.empty, Seq.empty, "details", + resourceProfileId = ResourceProfile.DEFAULT_RESOURCE_PROFILE_ID)) } private def createStageCompletedEvent(stageId: Int) = { SparkListenerStageCompleted(new StageInfo(stageId, 0, stageId.toString, 0, - Seq.empty, Seq.empty, "details")) + Seq.empty, Seq.empty, "details", + 
resourceProfileId = ResourceProfile.DEFAULT_RESOURCE_PROFILE_ID)) } private def createExecutorAddedEvent(executorId: Int) = { diff --git a/core/src/test/scala/org/apache/spark/scheduler/dynalloc/ExecutorMonitorSuite.scala b/core/src/test/scala/org/apache/spark/scheduler/dynalloc/ExecutorMonitorSuite.scala index 615389ae5c2d4..3596a9ebb1f5a 100644 --- a/core/src/test/scala/org/apache/spark/scheduler/dynalloc/ExecutorMonitorSuite.scala +++ b/core/src/test/scala/org/apache/spark/scheduler/dynalloc/ExecutorMonitorSuite.scala @@ -28,6 +28,7 @@ import org.apache.spark._ import org.apache.spark.executor.ExecutorMetrics import org.apache.spark.internal.config._ import org.apache.spark.resource.ResourceProfile.{DEFAULT_RESOURCE_PROFILE_ID, UNKNOWN_RESOURCE_PROFILE_ID} +import org.apache.spark.resource.ResourceProfile import org.apache.spark.scheduler._ import org.apache.spark.scheduler.cluster.ExecutorInfo import org.apache.spark.storage._ @@ -255,25 +256,28 @@ class ExecutorMonitorSuite extends SparkFunSuite { test("track executors pending for removal") { knownExecs ++= Set("1", "2", "3") + val execInfoRp1 = new ExecutorInfo("host1", 1, Map.empty, + Map.empty, Map.empty, 1) + monitor.onExecutorAdded(SparkListenerExecutorAdded(clock.getTimeMillis(), "1", execInfo)) monitor.onExecutorAdded(SparkListenerExecutorAdded(clock.getTimeMillis(), "2", execInfo)) - monitor.onExecutorAdded(SparkListenerExecutorAdded(clock.getTimeMillis(), "3", execInfo)) + monitor.onExecutorAdded(SparkListenerExecutorAdded(clock.getTimeMillis(), "3", execInfoRp1)) clock.setTime(idleDeadline) - assert(monitor.timedOutExecutors().toSet === Set("1", "2", "3")) + assert(monitor.timedOutExecutors().toSet === Set(("1", 0), ("2", 0), ("3", 1))) assert(monitor.pendingRemovalCount === 0) // Notify that only a subset of executors was killed, to mimic the case where the scheduler // refuses to kill an executor that is busy for whatever reason the monitor hasn't detected yet. monitor.executorsKilled(Seq("1")) - assert(monitor.timedOutExecutors().toSet === Set("2", "3")) + assert(monitor.timedOutExecutors().toSet === Set(("2", 0), ("3", 1))) assert(monitor.pendingRemovalCount === 1) // Check the timed out executors again so that we're sure they're still timed out when no // events happen. This ensures that the monitor doesn't lose track of them. 
- assert(monitor.timedOutExecutors().toSet === Set("2", "3")) + assert(monitor.timedOutExecutors().toSet === Set(("2", 0), ("3", 1))) monitor.onTaskStart(SparkListenerTaskStart(1, 1, taskInfo("2", 1))) - assert(monitor.timedOutExecutors().toSet === Set("3")) + assert(monitor.timedOutExecutors().toSet === Set(("3", 1))) monitor.executorsKilled(Seq("3")) assert(monitor.pendingRemovalCount === 2) @@ -282,7 +286,7 @@ class ExecutorMonitorSuite extends SparkFunSuite { new ExecutorMetrics, null)) assert(monitor.timedOutExecutors().isEmpty) clock.advance(idleDeadline) - assert(monitor.timedOutExecutors().toSet === Set("2")) + assert(monitor.timedOutExecutors().toSet === Set(("2", 0))) } test("shuffle block tracking") { @@ -435,7 +439,8 @@ class ExecutorMonitorSuite extends SparkFunSuite { private def stageInfo(id: Int, shuffleId: Int = -1): StageInfo = { new StageInfo(id, 0, s"stage$id", 1, Nil, Nil, "", - shuffleDepId = if (shuffleId >= 0) Some(shuffleId) else None) + shuffleDepId = if (shuffleId >= 0) Some(shuffleId) else None, + resourceProfileId = ResourceProfile.DEFAULT_RESOURCE_PROFILE_ID) } private def taskInfo( diff --git a/core/src/test/scala/org/apache/spark/status/AppStatusListenerSuite.scala b/core/src/test/scala/org/apache/spark/status/AppStatusListenerSuite.scala index 255f91866ef58..24eb1685f577a 100644 --- a/core/src/test/scala/org/apache/spark/status/AppStatusListenerSuite.scala +++ b/core/src/test/scala/org/apache/spark/status/AppStatusListenerSuite.scala @@ -30,6 +30,7 @@ import org.apache.spark._ import org.apache.spark.executor.{ExecutorMetrics, TaskMetrics} import org.apache.spark.internal.config.Status._ import org.apache.spark.metrics.ExecutorMetricType +import org.apache.spark.resource.ResourceProfile import org.apache.spark.scheduler._ import org.apache.spark.scheduler.cluster._ import org.apache.spark.status.ListenerEventsTestHelper._ @@ -151,8 +152,10 @@ class AppStatusListenerSuite extends SparkFunSuite with BeforeAndAfter { // Start a job with 2 stages / 4 tasks each time += 1 val stages = Seq( - new StageInfo(1, 0, "stage1", 4, Nil, Nil, "details1"), - new StageInfo(2, 0, "stage2", 4, Nil, Seq(1), "details2")) + new StageInfo(1, 0, "stage1", 4, Nil, Nil, "details1", + resourceProfileId = ResourceProfile.DEFAULT_RESOURCE_PROFILE_ID), + new StageInfo(2, 0, "stage2", 4, Nil, Seq(1), "details2", + resourceProfileId = ResourceProfile.DEFAULT_RESOURCE_PROFILE_ID)) val jobProps = new Properties() jobProps.setProperty(SparkContext.SPARK_JOB_DESCRIPTION, "jobDescription") @@ -524,7 +527,8 @@ class AppStatusListenerSuite extends SparkFunSuite with BeforeAndAfter { // - Re-submit stage 2, all tasks, and succeed them and the stage. val oldS2 = stages.last val newS2 = new StageInfo(oldS2.stageId, oldS2.attemptNumber + 1, oldS2.name, oldS2.numTasks, - oldS2.rddInfos, oldS2.parentIds, oldS2.details, oldS2.taskMetrics) + oldS2.rddInfos, oldS2.parentIds, oldS2.details, oldS2.taskMetrics, + resourceProfileId = ResourceProfile.DEFAULT_RESOURCE_PROFILE_ID) time += 1 newS2.submissionTime = Some(time) @@ -575,8 +579,10 @@ class AppStatusListenerSuite extends SparkFunSuite with BeforeAndAfter { // change the stats of the already finished job. 
time += 1 val j2Stages = Seq( - new StageInfo(3, 0, "stage1", 4, Nil, Nil, "details1"), - new StageInfo(4, 0, "stage2", 4, Nil, Seq(3), "details2")) + new StageInfo(3, 0, "stage1", 4, Nil, Nil, "details1", + resourceProfileId = ResourceProfile.DEFAULT_RESOURCE_PROFILE_ID), + new StageInfo(4, 0, "stage2", 4, Nil, Seq(3), "details2", + resourceProfileId = ResourceProfile.DEFAULT_RESOURCE_PROFILE_ID)) j2Stages.last.submissionTime = Some(time) listener.onJobStart(SparkListenerJobStart(2, time, j2Stages, null)) assert(store.count(classOf[JobDataWrapper]) === 2) @@ -703,7 +709,8 @@ class AppStatusListenerSuite extends SparkFunSuite with BeforeAndAfter { // Submit a stage for the first RDD before it's marked for caching, to make sure later // the listener picks up the correct storage level. val rdd1Info = new RDDInfo(rdd1b1.rddId, "rdd1", 2, StorageLevel.NONE, false, Nil) - val stage0 = new StageInfo(0, 0, "stage0", 4, Seq(rdd1Info), Nil, "details0") + val stage0 = new StageInfo(0, 0, "stage0", 4, Seq(rdd1Info), Nil, "details0", + resourceProfileId = ResourceProfile.DEFAULT_RESOURCE_PROFILE_ID) listener.onStageSubmitted(SparkListenerStageSubmitted(stage0, new Properties())) listener.onStageCompleted(SparkListenerStageCompleted(stage0)) assert(store.count(classOf[RDDStorageInfoWrapper]) === 0) @@ -711,7 +718,8 @@ class AppStatusListenerSuite extends SparkFunSuite with BeforeAndAfter { // Submit a stage and make sure the RDDs are recorded. rdd1Info.storageLevel = level val rdd2Info = new RDDInfo(rdd2b1.rddId, "rdd2", 1, level, false, Nil) - val stage = new StageInfo(1, 0, "stage1", 4, Seq(rdd1Info, rdd2Info), Nil, "details1") + val stage = new StageInfo(1, 0, "stage1", 4, Seq(rdd1Info, rdd2Info), Nil, "details1", + resourceProfileId = ResourceProfile.DEFAULT_RESOURCE_PROFILE_ID) listener.onStageSubmitted(SparkListenerStageSubmitted(stage, new Properties())) check[RDDStorageInfoWrapper](rdd1b1.rddId) { wrapper => @@ -1018,9 +1026,12 @@ class AppStatusListenerSuite extends SparkFunSuite with BeforeAndAfter { // data is not deleted. time += 1 val stages = Seq( - new StageInfo(1, 0, "stage1", 4, Nil, Nil, "details1"), - new StageInfo(2, 0, "stage2", 4, Nil, Nil, "details2"), - new StageInfo(3, 0, "stage3", 4, Nil, Nil, "details3")) + new StageInfo(1, 0, "stage1", 4, Nil, Nil, "details1", + resourceProfileId = ResourceProfile.DEFAULT_RESOURCE_PROFILE_ID), + new StageInfo(2, 0, "stage2", 4, Nil, Nil, "details2", + resourceProfileId = ResourceProfile.DEFAULT_RESOURCE_PROFILE_ID), + new StageInfo(3, 0, "stage3", 4, Nil, Nil, "details3", + resourceProfileId = ResourceProfile.DEFAULT_RESOURCE_PROFILE_ID)) // Graph data is generated by the job start event, so fire it. 
listener.onJobStart(SparkListenerJobStart(4, time, stages, null)) @@ -1068,7 +1079,8 @@ class AppStatusListenerSuite extends SparkFunSuite with BeforeAndAfter { } assert(store.count(classOf[CachedQuantile], "stage", key(dropped)) === 0) - val attempt2 = new StageInfo(3, 1, "stage3", 4, Nil, Nil, "details3") + val attempt2 = new StageInfo(3, 1, "stage3", 4, Nil, Nil, "details3", + resourceProfileId = ResourceProfile.DEFAULT_RESOURCE_PROFILE_ID) time += 1 attempt2.submissionTime = Some(time) listener.onStageSubmitted(SparkListenerStageSubmitted(attempt2, new Properties())) @@ -1139,9 +1151,12 @@ class AppStatusListenerSuite extends SparkFunSuite with BeforeAndAfter { val testConf = conf.clone().set(MAX_RETAINED_STAGES, 2) val listener = new AppStatusListener(store, testConf, true) - val stage1 = new StageInfo(1, 0, "stage1", 4, Nil, Nil, "details1") - val stage2 = new StageInfo(2, 0, "stage2", 4, Nil, Nil, "details2") - val stage3 = new StageInfo(3, 0, "stage3", 4, Nil, Nil, "details3") + val stage1 = new StageInfo(1, 0, "stage1", 4, Nil, Nil, "details1", + resourceProfileId = ResourceProfile.DEFAULT_RESOURCE_PROFILE_ID) + val stage2 = new StageInfo(2, 0, "stage2", 4, Nil, Nil, "details2", + resourceProfileId = ResourceProfile.DEFAULT_RESOURCE_PROFILE_ID) + val stage3 = new StageInfo(3, 0, "stage3", 4, Nil, Nil, "details3", + resourceProfileId = ResourceProfile.DEFAULT_RESOURCE_PROFILE_ID) // Start stage 1 and stage 2 time += 1 @@ -1172,8 +1187,10 @@ class AppStatusListenerSuite extends SparkFunSuite with BeforeAndAfter { val testConf = conf.clone().set(MAX_RETAINED_STAGES, 2) val listener = new AppStatusListener(store, testConf, true) - val stage1 = new StageInfo(1, 0, "stage1", 4, Nil, Nil, "details1") - val stage2 = new StageInfo(2, 0, "stage2", 4, Nil, Nil, "details2") + val stage1 = new StageInfo(1, 0, "stage1", 4, Nil, Nil, "details1", + resourceProfileId = ResourceProfile.DEFAULT_RESOURCE_PROFILE_ID) + val stage2 = new StageInfo(2, 0, "stage2", 4, Nil, Nil, "details2", + resourceProfileId = ResourceProfile.DEFAULT_RESOURCE_PROFILE_ID) // Sart job 1 time += 1 @@ -1193,7 +1210,8 @@ class AppStatusListenerSuite extends SparkFunSuite with BeforeAndAfter { listener.onJobEnd(SparkListenerJobEnd(1, time, JobSucceeded)) // Submit stage 3 and verify stage 2 is evicted - val stage3 = new StageInfo(3, 0, "stage3", 4, Nil, Nil, "details3") + val stage3 = new StageInfo(3, 0, "stage3", 4, Nil, Nil, "details3", + resourceProfileId = ResourceProfile.DEFAULT_RESOURCE_PROFILE_ID) time += 1 stage3.submissionTime = Some(time) listener.onStageSubmitted(SparkListenerStageSubmitted(stage3, new Properties())) @@ -1208,7 +1226,8 @@ class AppStatusListenerSuite extends SparkFunSuite with BeforeAndAfter { val testConf = conf.clone().set(MAX_RETAINED_TASKS_PER_STAGE, 2) val listener = new AppStatusListener(store, testConf, true) - val stage1 = new StageInfo(1, 0, "stage1", 4, Nil, Nil, "details1") + val stage1 = new StageInfo(1, 0, "stage1", 4, Nil, Nil, "details1", + resourceProfileId = ResourceProfile.DEFAULT_RESOURCE_PROFILE_ID) stage1.submissionTime = Some(time) listener.onStageSubmitted(SparkListenerStageSubmitted(stage1, new Properties())) @@ -1243,9 +1262,12 @@ class AppStatusListenerSuite extends SparkFunSuite with BeforeAndAfter { val listener = new AppStatusListener(store, testConf, true) val appStore = new AppStatusStore(store) - val stage1 = new StageInfo(1, 0, "stage1", 4, Nil, Nil, "details1") - val stage2 = new StageInfo(2, 0, "stage2", 4, Nil, Nil, "details2") - val stage3 = new StageInfo(3, 0, 
"stage3", 4, Nil, Nil, "details3") + val stage1 = new StageInfo(1, 0, "stage1", 4, Nil, Nil, "details1", + resourceProfileId = ResourceProfile.DEFAULT_RESOURCE_PROFILE_ID) + val stage2 = new StageInfo(2, 0, "stage2", 4, Nil, Nil, "details2", + resourceProfileId = ResourceProfile.DEFAULT_RESOURCE_PROFILE_ID) + val stage3 = new StageInfo(3, 0, "stage3", 4, Nil, Nil, "details3", + resourceProfileId = ResourceProfile.DEFAULT_RESOURCE_PROFILE_ID) time += 1 stage1.submissionTime = Some(time) @@ -1274,8 +1296,10 @@ class AppStatusListenerSuite extends SparkFunSuite with BeforeAndAfter { test("SPARK-24415: update metrics for tasks that finish late") { val listener = new AppStatusListener(store, conf, true) - val stage1 = new StageInfo(1, 0, "stage1", 4, Nil, Nil, "details1") - val stage2 = new StageInfo(2, 0, "stage2", 4, Nil, Nil, "details2") + val stage1 = new StageInfo(1, 0, "stage1", 4, Nil, Nil, "details1", + resourceProfileId = ResourceProfile.DEFAULT_RESOURCE_PROFILE_ID) + val stage2 = new StageInfo(2, 0, "stage2", 4, Nil, Nil, "details2", + resourceProfileId = ResourceProfile.DEFAULT_RESOURCE_PROFILE_ID) // Start job listener.onJobStart(SparkListenerJobStart(1, time, Seq(stage1, stage2), null)) @@ -1340,7 +1364,8 @@ class AppStatusListenerSuite extends SparkFunSuite with BeforeAndAfter { listener.onExecutorAdded(createExecutorAddedEvent(1)) listener.onExecutorAdded(createExecutorAddedEvent(2)) - val stage = new StageInfo(1, 0, "stage", 4, Nil, Nil, "details") + val stage = new StageInfo(1, 0, "stage", 4, Nil, Nil, "details", + resourceProfileId = ResourceProfile.DEFAULT_RESOURCE_PROFILE_ID) listener.onJobStart(SparkListenerJobStart(1, time, Seq(stage), null)) listener.onStageSubmitted(SparkListenerStageSubmitted(stage, new Properties())) @@ -1577,7 +1602,8 @@ class AppStatusListenerSuite extends SparkFunSuite with BeforeAndAfter { // Submit a stage and make sure the RDDs are recorded. val rdd1Info = new RDDInfo(rdd1b1.rddId, "rdd1", 2, level, false, Nil) - val stage = new StageInfo(1, 0, "stage1", 4, Seq(rdd1Info), Nil, "details1") + val stage = new StageInfo(1, 0, "stage1", 4, Seq(rdd1Info), Nil, "details1", + resourceProfileId = ResourceProfile.DEFAULT_RESOURCE_PROFILE_ID) listener.onStageSubmitted(SparkListenerStageSubmitted(stage, new Properties())) // Add partition 1 replicated on two block managers. 
diff --git a/core/src/test/scala/org/apache/spark/status/ListenerEventsTestHelper.scala b/core/src/test/scala/org/apache/spark/status/ListenerEventsTestHelper.scala index 4b3fbacc47f9c..99c0d9593ccae 100644 --- a/core/src/test/scala/org/apache/spark/status/ListenerEventsTestHelper.scala +++ b/core/src/test/scala/org/apache/spark/status/ListenerEventsTestHelper.scala @@ -23,6 +23,7 @@ import scala.collection.immutable.Map import org.apache.spark.{AccumulatorSuite, SparkContext, Success, TaskState} import org.apache.spark.executor.{ExecutorMetrics, TaskMetrics} +import org.apache.spark.resource.ResourceProfile import org.apache.spark.scheduler.{SparkListener, SparkListenerExecutorAdded, SparkListenerExecutorMetricsUpdate, SparkListenerExecutorRemoved, SparkListenerJobStart, SparkListenerStageCompleted, SparkListenerStageSubmitted, SparkListenerTaskEnd, SparkListenerTaskStart, StageInfo, TaskInfo, TaskLocality} import org.apache.spark.scheduler.cluster.ExecutorInfo import org.apache.spark.storage.{RDDInfo, StorageLevel} @@ -61,7 +62,8 @@ object ListenerEventsTestHelper { } def createStage(id: Int, rdds: Seq[RDDInfo], parentIds: Seq[Int]): StageInfo = { - new StageInfo(id, 0, s"stage${id}", 4, rdds, parentIds, s"details${id}") + new StageInfo(id, 0, s"stage${id}", 4, rdds, parentIds, s"details${id}", + resourceProfileId = ResourceProfile.DEFAULT_RESOURCE_PROFILE_ID) } def createStage(rdds: Seq[RDDInfo], parentIds: Seq[Int]): StageInfo = { @@ -96,13 +98,15 @@ object ListenerEventsTestHelper { /** Create a stage submitted event for the specified stage Id. */ def createStageSubmittedEvent(stageId: Int): SparkListenerStageSubmitted = { SparkListenerStageSubmitted(new StageInfo(stageId, 0, stageId.toString, 0, - Seq.empty, Seq.empty, "details")) + Seq.empty, Seq.empty, "details", + resourceProfileId = ResourceProfile.DEFAULT_RESOURCE_PROFILE_ID)) } /** Create a stage completed event for the specified stage Id. 
*/ def createStageCompletedEvent(stageId: Int): SparkListenerStageCompleted = { SparkListenerStageCompleted(new StageInfo(stageId, 0, stageId.toString, 0, - Seq.empty, Seq.empty, "details")) + Seq.empty, Seq.empty, "details", + resourceProfileId = ResourceProfile.DEFAULT_RESOURCE_PROFILE_ID)) } def createExecutorAddedEvent(executorId: Int): SparkListenerExecutorAdded = { diff --git a/core/src/test/scala/org/apache/spark/ui/StagePageSuite.scala b/core/src/test/scala/org/apache/spark/ui/StagePageSuite.scala index bd18e9e628da8..7711934cbe8a6 100644 --- a/core/src/test/scala/org/apache/spark/ui/StagePageSuite.scala +++ b/core/src/test/scala/org/apache/spark/ui/StagePageSuite.scala @@ -27,6 +27,7 @@ import org.mockito.Mockito.{mock, when, RETURNS_SMART_NULLS} import org.apache.spark._ import org.apache.spark.executor.{ExecutorMetrics, TaskMetrics} import org.apache.spark.internal.config.Status._ +import org.apache.spark.resource.ResourceProfile import org.apache.spark.scheduler._ import org.apache.spark.status.AppStatusStore import org.apache.spark.status.api.v1.{AccumulableInfo => UIAccumulableInfo, StageData, StageStatus} @@ -131,7 +132,8 @@ class StagePageSuite extends SparkFunSuite with LocalSparkContext { val page = new StagePage(tab, statusStore) // Simulate a stage in job progress listener - val stageInfo = new StageInfo(0, 0, "dummy", 1, Seq.empty, Seq.empty, "details") + val stageInfo = new StageInfo(0, 0, "dummy", 1, Seq.empty, Seq.empty, "details", + resourceProfileId = ResourceProfile.DEFAULT_RESOURCE_PROFILE_ID) // Simulate two tasks to test PEAK_EXECUTION_MEMORY correctness (1 to 2).foreach { taskId => diff --git a/core/src/test/scala/org/apache/spark/util/JsonProtocolSuite.scala b/core/src/test/scala/org/apache/spark/util/JsonProtocolSuite.scala index a2a4b3aa974fc..edc0662a0f73e 100644 --- a/core/src/test/scala/org/apache/spark/util/JsonProtocolSuite.scala +++ b/core/src/test/scala/org/apache/spark/util/JsonProtocolSuite.scala @@ -32,8 +32,7 @@ import org.apache.spark._ import org.apache.spark.executor._ import org.apache.spark.metrics.ExecutorMetricType import org.apache.spark.rdd.RDDOperationScope -import org.apache.spark.resource.ResourceInformation -import org.apache.spark.resource.ResourceUtils +import org.apache.spark.resource.{ResourceInformation, ResourceProfile, ResourceUtils} import org.apache.spark.scheduler._ import org.apache.spark.scheduler.cluster.ExecutorInfo import org.apache.spark.shuffle.MetadataFetchFailedException @@ -341,7 +340,8 @@ class JsonProtocolSuite extends SparkFunSuite { val stageIds = Seq[Int](1, 2, 3, 4) val stageInfos = stageIds.map(x => makeStageInfo(x, x * 200, x * 300, x * 400L, x * 500L)) val dummyStageInfos = - stageIds.map(id => new StageInfo(id, 0, "unknown", 0, Seq.empty, Seq.empty, "unknown")) + stageIds.map(id => new StageInfo(id, 0, "unknown", 0, Seq.empty, Seq.empty, "unknown", + resourceProfileId = ResourceProfile.DEFAULT_RESOURCE_PROFILE_ID)) val jobStart = SparkListenerJobStart(10, jobSubmissionTime, stageInfos, properties) val oldEvent = JsonProtocol.jobStartToJson(jobStart).removeField({_._1 == "Stage Infos"}) val expectedJobStart = @@ -383,9 +383,11 @@ class JsonProtocolSuite extends SparkFunSuite { test("StageInfo backward compatibility (parent IDs)") { // Prior to Spark 1.4.0, StageInfo did not have the "Parent IDs" property - val stageInfo = new StageInfo(1, 1, "me-stage", 1, Seq.empty, Seq(1, 2, 3), "details") + val stageInfo = new StageInfo(1, 1, "me-stage", 1, Seq.empty, Seq(1, 2, 3), "details", + resourceProfileId = 
ResourceProfile.DEFAULT_RESOURCE_PROFILE_ID) val oldStageInfo = JsonProtocol.stageInfoToJson(stageInfo).removeField({ _._1 == "Parent IDs"}) - val expectedStageInfo = new StageInfo(1, 1, "me-stage", 1, Seq.empty, Seq.empty, "details") + val expectedStageInfo = new StageInfo(1, 1, "me-stage", 1, Seq.empty, Seq.empty, "details", + resourceProfileId = ResourceProfile.DEFAULT_RESOURCE_PROFILE_ID) assertEquals(expectedStageInfo, JsonProtocol.stageInfoFromJson(oldStageInfo)) } @@ -873,7 +875,8 @@ private[spark] object JsonProtocolSuite extends Assertions { private def makeStageInfo(a: Int, b: Int, c: Int, d: Long, e: Long) = { val rddInfos = (0 until a % 5).map { i => makeRddInfo(a + i, b + i, c + i, d + i, e + i) } - val stageInfo = new StageInfo(a, 0, "greetings", b, rddInfos, Seq(100, 200, 300), "details") + val stageInfo = new StageInfo(a, 0, "greetings", b, rddInfos, Seq(100, 200, 300), "details", + resourceProfileId = ResourceProfile.DEFAULT_RESOURCE_PROFILE_ID) val (acc1, acc2) = (makeAccumulableInfo(1), makeAccumulableInfo(2)) stageInfo.accumulables(acc1.id) = acc1 stageInfo.accumulables(acc2.id) = acc2 diff --git a/python/pyspark/tests/test_context.py b/python/pyspark/tests/test_context.py index c7f435a582210..edfea42bed71d 100644 --- a/python/pyspark/tests/test_context.py +++ b/python/pyspark/tests/test_context.py @@ -275,9 +275,13 @@ def setUp(self): self.tempFile = tempfile.NamedTemporaryFile(delete=False) self.tempFile.write(b'echo {\\"name\\": \\"gpu\\", \\"addresses\\": [\\"0\\"]}') self.tempFile.close() + # create temporary directory for Worker resources coordination + self.tempdir = tempfile.NamedTemporaryFile(delete=False) + os.unlink(self.tempdir.name) os.chmod(self.tempFile.name, stat.S_IRWXU | stat.S_IXGRP | stat.S_IRGRP | stat.S_IROTH | stat.S_IXOTH) conf = SparkConf().set("spark.test.home", SPARK_HOME) + conf = conf.set("spark.resources.dir", self.tempdir.name) conf = conf.set("spark.driver.resource.gpu.amount", "1") conf = conf.set("spark.driver.resource.gpu.discoveryScript", self.tempFile.name) self.sc = SparkContext('local-cluster[2,1,1024]', class_name, conf=conf) @@ -292,6 +296,7 @@ def test_resources(self): def tearDown(self): os.unlink(self.tempFile.name) + shutil.rmtree(self.tempdir.name) self.sc.stop() diff --git a/python/pyspark/tests/test_taskcontext.py b/python/pyspark/tests/test_taskcontext.py index 6095a384679af..68cfe814762e0 100644 --- a/python/pyspark/tests/test_taskcontext.py +++ b/python/pyspark/tests/test_taskcontext.py @@ -16,6 +16,7 @@ # import os import random +import shutil import stat import sys import tempfile @@ -277,9 +278,13 @@ def setUp(self): self.tempFile = tempfile.NamedTemporaryFile(delete=False) self.tempFile.write(b'echo {\\"name\\": \\"gpu\\", \\"addresses\\": [\\"0\\"]}') self.tempFile.close() + # create temporary directory for Worker resources coordination + self.tempdir = tempfile.NamedTemporaryFile(delete=False) + os.unlink(self.tempdir.name) os.chmod(self.tempFile.name, stat.S_IRWXU | stat.S_IXGRP | stat.S_IRGRP | stat.S_IROTH | stat.S_IXOTH) conf = SparkConf().set("spark.test.home", SPARK_HOME) + conf = conf.set("spark.resources.dir", self.tempdir.name) conf = conf.set("spark.worker.resource.gpu.discoveryScript", self.tempFile.name) conf = conf.set("spark.worker.resource.gpu.amount", 1) conf = conf.set("spark.task.resource.gpu.amount", "1") @@ -297,6 +302,7 @@ def test_resources(self): def tearDown(self): os.unlink(self.tempFile.name) + shutil.rmtree(self.tempdir.name) self.sc.stop() if __name__ == "__main__": diff --git 
a/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/scheduler/cluster/k8s/KubernetesClusterSchedulerBackend.scala b/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/scheduler/cluster/k8s/KubernetesClusterSchedulerBackend.scala index 105841ac834b3..5655ef50d214f 100644 --- a/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/scheduler/cluster/k8s/KubernetesClusterSchedulerBackend.scala +++ b/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/scheduler/cluster/k8s/KubernetesClusterSchedulerBackend.scala @@ -27,6 +27,7 @@ import org.apache.spark.deploy.k8s.Config._ import org.apache.spark.deploy.k8s.Constants._ import org.apache.spark.deploy.security.HadoopDelegationTokenManager import org.apache.spark.internal.config.SCHEDULER_MIN_REGISTERED_RESOURCES_RATIO +import org.apache.spark.resource.ResourceProfile import org.apache.spark.rpc.RpcAddress import org.apache.spark.scheduler.{ExecutorKilled, ExecutorLossReason, TaskSchedulerImpl} import org.apache.spark.scheduler.cluster.{CoarseGrainedSchedulerBackend, SchedulerBackendUtils} @@ -55,6 +56,8 @@ private[spark] class KubernetesClusterSchedulerBackend( private val shouldDeleteExecutors = conf.get(KUBERNETES_DELETE_EXECUTORS) + private val defaultProfile = scheduler.sc.resourceProfileManager.defaultResourceProfile + // Allow removeExecutor to be accessible by ExecutorPodsLifecycleEventHandler private[k8s] def doRemoveExecutor(executorId: String, reason: ExecutorLossReason): Unit = { if (isExecutorActive(executorId)) { @@ -116,8 +119,9 @@ private[spark] class KubernetesClusterSchedulerBackend( } } - override def doRequestTotalExecutors(requestedTotal: Int): Future[Boolean] = { - podAllocator.setTotalExpectedExecutors(requestedTotal) + override def doRequestTotalExecutors( + resourceProfileToTotalExecs: Map[ResourceProfile, Int]): Future[Boolean] = { + podAllocator.setTotalExpectedExecutors(resourceProfileToTotalExecs(defaultProfile)) Future.successful(true) } diff --git a/resource-managers/kubernetes/core/src/test/scala/org/apache/spark/scheduler/cluster/k8s/KubernetesClusterSchedulerBackendSuite.scala b/resource-managers/kubernetes/core/src/test/scala/org/apache/spark/scheduler/cluster/k8s/KubernetesClusterSchedulerBackendSuite.scala index 7e1e39c85a183..8c683e85dd5e2 100644 --- a/resource-managers/kubernetes/core/src/test/scala/org/apache/spark/scheduler/cluster/k8s/KubernetesClusterSchedulerBackendSuite.scala +++ b/resource-managers/kubernetes/core/src/test/scala/org/apache/spark/scheduler/cluster/k8s/KubernetesClusterSchedulerBackendSuite.scala @@ -31,6 +31,7 @@ import org.apache.spark.{SparkConf, SparkContext, SparkEnv, SparkFunSuite} import org.apache.spark.deploy.k8s.Config._ import org.apache.spark.deploy.k8s.Constants._ import org.apache.spark.deploy.k8s.Fabric8Aliases._ +import org.apache.spark.resource.ResourceProfileManager import org.apache.spark.rpc.{RpcEndpoint, RpcEndpointRef, RpcEnv} import org.apache.spark.scheduler.{ExecutorKilled, TaskSchedulerImpl} import org.apache.spark.scheduler.cluster.CoarseGrainedClusterMessages.RemoveExecutor @@ -86,10 +87,13 @@ class KubernetesClusterSchedulerBackendSuite extends SparkFunSuite with BeforeAn private var driverEndpoint: ArgumentCaptor[RpcEndpoint] = _ private var schedulerBackendUnderTest: KubernetesClusterSchedulerBackend = _ + private val resourceProfileManager = new ResourceProfileManager(sparkConf) + before { MockitoAnnotations.initMocks(this) when(taskScheduler.sc).thenReturn(sc) when(sc.conf).thenReturn(sparkConf) + 
when(sc.resourceProfileManager).thenReturn(resourceProfileManager) when(sc.env).thenReturn(env) when(env.rpcEnv).thenReturn(rpcEnv) driverEndpoint = ArgumentCaptor.forClass(classOf[RpcEndpoint]) diff --git a/resource-managers/mesos/src/main/scala/org/apache/spark/scheduler/cluster/mesos/MesosCoarseGrainedSchedulerBackend.scala b/resource-managers/mesos/src/main/scala/org/apache/spark/scheduler/cluster/mesos/MesosCoarseGrainedSchedulerBackend.scala index e916125ffdb67..0b447025c8a7a 100644 --- a/resource-managers/mesos/src/main/scala/org/apache/spark/scheduler/cluster/mesos/MesosCoarseGrainedSchedulerBackend.scala +++ b/resource-managers/mesos/src/main/scala/org/apache/spark/scheduler/cluster/mesos/MesosCoarseGrainedSchedulerBackend.scala @@ -38,6 +38,7 @@ import org.apache.spark.internal.config.Tests.IS_TESTING import org.apache.spark.launcher.{LauncherBackend, SparkAppHandle} import org.apache.spark.network.netty.SparkTransportConf import org.apache.spark.network.shuffle.mesos.MesosExternalBlockStoreClient +import org.apache.spark.resource.ResourceProfile import org.apache.spark.rpc.{RpcEndpointAddress, RpcEndpointRef} import org.apache.spark.scheduler.{SlaveLost, TaskSchedulerImpl} import org.apache.spark.scheduler.cluster.CoarseGrainedSchedulerBackend @@ -181,6 +182,9 @@ private[spark] class MesosCoarseGrainedSchedulerBackend( private var schedulerDriver: SchedulerDriver = _ + private val defaultProfile = sc.resourceProfileManager.defaultResourceProfile + + def newMesosTaskId(): String = { val id = nextMesosTaskId nextMesosTaskId += 1 @@ -595,13 +599,16 @@ private[spark] class MesosCoarseGrainedSchedulerBackend( } private def satisfiesLocality(offerHostname: String): Boolean = { + val hostToLocalTaskCount = + rpHostToLocalTaskCount.getOrElse(ResourceProfile.DEFAULT_RESOURCE_PROFILE_ID, Map.empty) if (!Utils.isDynamicAllocationEnabled(conf) || hostToLocalTaskCount.isEmpty) { return true } // Check the locality information val currentHosts = slaves.values.filter(_.taskIDs.nonEmpty).map(_.hostname).toSet - val allDesiredHosts = hostToLocalTaskCount.keys.toSet + val allDesiredHosts = hostToLocalTaskCount.map { case (k, v) => k }.toSet + // Try to match locality for hosts which do not have executors yet, to potentially // increase coverage. val remainingHosts = allDesiredHosts -- currentHosts @@ -759,11 +766,14 @@ private[spark] class MesosCoarseGrainedSchedulerBackend( super.applicationId } - override def doRequestTotalExecutors(requestedTotal: Int): Future[Boolean] = Future.successful { + override def doRequestTotalExecutors( + resourceProfileToTotalExecs: Map[ResourceProfile, Int] + ): Future[Boolean] = Future.successful { // We don't truly know if we can fulfill the full amount of executors // since at coarse grain it depends on the amount of slaves available. - logInfo("Capping the total amount of executors to " + requestedTotal) - executorLimitOption = Some(requestedTotal) + val numExecs = resourceProfileToTotalExecs.getOrElse(defaultProfile, 0) + logInfo("Capping the total amount of executors to " + numExecs) + executorLimitOption = Some(numExecs) // Update the locality wait start time to continue trying for locality. 
localityWaitStartTimeNs = System.nanoTime() true diff --git a/resource-managers/mesos/src/test/scala/org/apache/spark/scheduler/cluster/mesos/MesosCoarseGrainedSchedulerBackendSuite.scala b/resource-managers/mesos/src/test/scala/org/apache/spark/scheduler/cluster/mesos/MesosCoarseGrainedSchedulerBackendSuite.scala index 1876861700fc0..5ab277ed87a72 100644 --- a/resource-managers/mesos/src/test/scala/org/apache/spark/scheduler/cluster/mesos/MesosCoarseGrainedSchedulerBackendSuite.scala +++ b/resource-managers/mesos/src/test/scala/org/apache/spark/scheduler/cluster/mesos/MesosCoarseGrainedSchedulerBackendSuite.scala @@ -71,8 +71,10 @@ class MesosCoarseGrainedSchedulerBackendSuite extends SparkFunSuite offerResources(offers) verifyTaskLaunched(driver, "o1") + val totalExecs = Map(ResourceProfile.getOrCreateDefaultProfile(sparkConf) -> 0) // kills executors - assert(backend.doRequestTotalExecutors(0).futureValue) + val defaultResourceProfile = ResourceProfile.getOrCreateDefaultProfile(sparkConf) + assert(backend.doRequestTotalExecutors(Map(defaultResourceProfile -> 0)).futureValue) assert(backend.doKillExecutors(Seq("0")).futureValue) val taskID0 = createTaskId("0") verify(driver, times(1)).killTask(taskID0) @@ -82,7 +84,7 @@ class MesosCoarseGrainedSchedulerBackendSuite extends SparkFunSuite verifyDeclinedOffer(driver, createOfferId("o2")) // Launches a new task when requested executors is positive - backend.doRequestTotalExecutors(2) + backend.doRequestTotalExecutors(Map(defaultResourceProfile -> 2)) offerResources(offers, 2) verifyTaskLaunched(driver, "o2") } @@ -635,7 +637,12 @@ class MesosCoarseGrainedSchedulerBackendSuite extends SparkFunSuite assert(backend.getExecutorIds().isEmpty) - backend.requestTotalExecutors(2, 2, Map("hosts10" -> 1, "hosts11" -> 1)) + val defaultProfileId = ResourceProfile.DEFAULT_RESOURCE_PROFILE_ID + val defaultProf = ResourceProfile.getOrCreateDefaultProfile(sparkConf) + backend.requestTotalExecutors( + Map(defaultProfileId -> 2), + Map(defaultProfileId -> 2), + Map(defaultProfileId -> Map("hosts10" -> 1, "hosts11" -> 1))) // Offer non-local resources, which should be rejected offerResourcesAndVerify(1, false) @@ -651,7 +658,11 @@ class MesosCoarseGrainedSchedulerBackendSuite extends SparkFunSuite offerResourcesAndVerify(1, true) // Update total executors - backend.requestTotalExecutors(3, 3, Map("hosts10" -> 1, "hosts11" -> 1, "hosts12" -> 1)) + backend.requestTotalExecutors( + Map(ResourceProfile.DEFAULT_RESOURCE_PROFILE_ID -> 3), + Map(ResourceProfile.DEFAULT_RESOURCE_PROFILE_ID -> 2), + Map(ResourceProfile.DEFAULT_RESOURCE_PROFILE_ID -> + Map("hosts10" -> 1, "hosts11" -> 1, "hosts12" -> 1))) // Offer non-local resources, which should be rejected offerResourcesAndVerify(3, false) @@ -660,8 +671,11 @@ class MesosCoarseGrainedSchedulerBackendSuite extends SparkFunSuite Thread.sleep(2000) // Update total executors - backend.requestTotalExecutors(4, 4, Map("hosts10" -> 1, "hosts11" -> 1, "hosts12" -> 1, - "hosts13" -> 1)) + backend.requestTotalExecutors( + Map(ResourceProfile.DEFAULT_RESOURCE_PROFILE_ID -> 4), + Map(ResourceProfile.DEFAULT_RESOURCE_PROFILE_ID -> 4), + Map(ResourceProfile.DEFAULT_RESOURCE_PROFILE_ID -> + Map("hosts10" -> 1, "hosts11" -> 1, "hosts12" -> 1, "hosts13" -> 1))) // Offer non-local resources, which should be rejected offerResourcesAndVerify(3, false) diff --git a/resource-managers/yarn/src/main/scala/org/apache/spark/scheduler/cluster/YarnSchedulerBackend.scala 
b/resource-managers/yarn/src/main/scala/org/apache/spark/scheduler/cluster/YarnSchedulerBackend.scala index 471ee58d05cb8..f8bbc39c8bcc5 100644 --- a/resource-managers/yarn/src/main/scala/org/apache/spark/scheduler/cluster/YarnSchedulerBackend.scala +++ b/resource-managers/yarn/src/main/scala/org/apache/spark/scheduler/cluster/YarnSchedulerBackend.scala @@ -27,13 +27,13 @@ import scala.util.{Failure, Success} import scala.util.control.NonFatal import org.apache.hadoop.yarn.api.records.{ApplicationAttemptId, ApplicationId} -import org.eclipse.jetty.servlet.{FilterHolder, FilterMapping} import org.apache.spark.SparkContext import org.apache.spark.deploy.security.HadoopDelegationTokenManager import org.apache.spark.internal.Logging import org.apache.spark.internal.config import org.apache.spark.internal.config.UI._ +import org.apache.spark.resource.ResourceProfile import org.apache.spark.rpc._ import org.apache.spark.scheduler._ import org.apache.spark.scheduler.cluster.CoarseGrainedClusterMessages._ @@ -92,7 +92,7 @@ private[spark] abstract class YarnSchedulerBackend( try { // SPARK-12009: To prevent Yarn allocator from requesting backup for the executors which // was Stopped by SchedulerBackend. - requestTotalExecutors(0, 0, Map.empty) + requestTotalExecutors(Map.empty, Map.empty, Map.empty) super.stop() } finally { stopped.set(true) @@ -123,21 +123,28 @@ private[spark] abstract class YarnSchedulerBackend( } } - private[cluster] def prepareRequestExecutors(requestedTotal: Int): RequestExecutors = { + private[cluster] def prepareRequestExecutors( + resourceProfileToTotalExecs: Map[ResourceProfile, Int]): RequestExecutors = { val nodeBlacklist: Set[String] = scheduler.nodeBlacklist() // For locality preferences, ignore preferences for nodes that are blacklisted - val filteredHostToLocalTaskCount = - hostToLocalTaskCount.filter { case (k, v) => !nodeBlacklist.contains(k) } - RequestExecutors(requestedTotal, localityAwareTasks, filteredHostToLocalTaskCount, - nodeBlacklist) + val filteredRPHostToLocalTaskCount = rpHostToLocalTaskCount.map { case (rpid, v) => + (rpid, v.filter { case (host, count) => !nodeBlacklist.contains(host) }) + } + // TODO - default everything to default profile until YARN pieces + val defaultProf = ResourceProfile.getOrCreateDefaultProfile(conf) + val hostToLocalTaskCount = filteredRPHostToLocalTaskCount.getOrElse(defaultProf.id, Map.empty) + val localityAwareTasks = numLocalityAwareTasksPerResourceProfileId.getOrElse(defaultProf.id, 0) + val numExecutors = resourceProfileToTotalExecs.getOrElse(defaultProf, 0) + RequestExecutors(numExecutors, localityAwareTasks, hostToLocalTaskCount, nodeBlacklist) } /** * Request executors from the ApplicationMaster by specifying the total number desired. * This includes executors already pending or running. 
*/ - override def doRequestTotalExecutors(requestedTotal: Int): Future[Boolean] = { - yarnSchedulerEndpointRef.ask[Boolean](prepareRequestExecutors(requestedTotal)) + override def doRequestTotalExecutors( + resourceProfileToTotalExecs: Map[ResourceProfile, Int]): Future[Boolean] = { + yarnSchedulerEndpointRef.ask[Boolean](prepareRequestExecutors(resourceProfileToTotalExecs)) } /** diff --git a/resource-managers/yarn/src/test/scala/org/apache/spark/scheduler/cluster/YarnSchedulerBackendSuite.scala b/resource-managers/yarn/src/test/scala/org/apache/spark/scheduler/cluster/YarnSchedulerBackendSuite.scala index a87820b1528ad..c0c6fff5130bb 100644 --- a/resource-managers/yarn/src/test/scala/org/apache/spark/scheduler/cluster/YarnSchedulerBackendSuite.scala +++ b/resource-managers/yarn/src/test/scala/org/apache/spark/scheduler/cluster/YarnSchedulerBackendSuite.scala @@ -24,6 +24,7 @@ import org.mockito.Mockito.when import org.scalatestplus.mockito.MockitoSugar import org.apache.spark._ +import org.apache.spark.resource.ResourceProfile import org.apache.spark.scheduler.TaskSchedulerImpl import org.apache.spark.serializer.JavaSerializer import org.apache.spark.ui.TestFilter @@ -51,7 +52,8 @@ class YarnSchedulerBackendSuite extends SparkFunSuite with MockitoSugar with Loc private class TestYarnSchedulerBackend(scheduler: TaskSchedulerImpl, sc: SparkContext) extends YarnSchedulerBackend(scheduler, sc) { def setHostToLocalTaskCount(hostToLocalTaskCount: Map[String, Int]): Unit = { - this.hostToLocalTaskCount = hostToLocalTaskCount + this.rpHostToLocalTaskCount = Map(ResourceProfile.DEFAULT_RESOURCE_PROFILE_ID -> + hostToLocalTaskCount) } } @@ -72,7 +74,8 @@ class YarnSchedulerBackendSuite extends SparkFunSuite with MockitoSugar with Loc } { yarnSchedulerBackendExtended.setHostToLocalTaskCount(hostToLocalCount) sched.setNodeBlacklist(blacklist) - val req = yarnSchedulerBackendExtended.prepareRequestExecutors(numRequested) + val numReq = Map(ResourceProfile.getOrCreateDefaultProfile(sc.getConf) -> numRequested) + val req = yarnSchedulerBackendExtended.prepareRequestExecutors(numReq) assert(req.requestedTotal === numRequested) assert(req.nodeBlacklist === blacklist) assert(req.hostToLocalTaskCount.keySet.intersect(blacklist).isEmpty) diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/ui/MetricsAggregationBenchmark.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/ui/MetricsAggregationBenchmark.scala index a88abc8209a88..c09ff51ecaff2 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/ui/MetricsAggregationBenchmark.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/ui/MetricsAggregationBenchmark.scala @@ -27,6 +27,7 @@ import org.apache.spark.{SparkConf, TaskState} import org.apache.spark.benchmark.{Benchmark, BenchmarkBase} import org.apache.spark.executor.ExecutorMetrics import org.apache.spark.internal.config.Status._ +import org.apache.spark.resource.ResourceProfile import org.apache.spark.scheduler._ import org.apache.spark.sql.execution._ import org.apache.spark.sql.execution.metric.SQLMetricInfo @@ -89,7 +90,8 @@ object MetricsAggregationBenchmark extends BenchmarkBase { val taskEventsTime = (0 until numStages).map { _ => val stageInfo = new StageInfo(idgen.incrementAndGet(), 0, getClass().getName(), - numTasks, Nil, Nil, getClass().getName()) + numTasks, Nil, Nil, getClass().getName(), + resourceProfileId = ResourceProfile.DEFAULT_RESOURCE_PROFILE_ID) val jobId = idgen.incrementAndGet() val jobStart = SparkListenerJobStart( diff --git 
a/sql/core/src/test/scala/org/apache/spark/sql/execution/ui/SQLAppStatusListenerSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/ui/SQLAppStatusListenerSuite.scala index 55b551d0af078..fdfd392a224cb 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/ui/SQLAppStatusListenerSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/ui/SQLAppStatusListenerSuite.scala @@ -30,6 +30,7 @@ import org.apache.spark.executor.ExecutorMetrics import org.apache.spark.internal.config import org.apache.spark.internal.config.Status._ import org.apache.spark.rdd.RDD +import org.apache.spark.resource.ResourceProfile import org.apache.spark.scheduler._ import org.apache.spark.sql.{DataFrame, SparkSession} import org.apache.spark.sql.catalyst.InternalRow @@ -86,7 +87,8 @@ class SQLAppStatusListenerSuite extends SharedSparkSession with JsonTestUtils name = "", rddInfos = Nil, parentIds = Nil, - details = "") + details = "", + resourceProfileId = ResourceProfile.DEFAULT_RESOURCE_PROFILE_ID) } private def createTaskInfo( diff --git a/streaming/src/main/scala/org/apache/spark/streaming/scheduler/ExecutorAllocationManager.scala b/streaming/src/main/scala/org/apache/spark/streaming/scheduler/ExecutorAllocationManager.scala index e85a3b9009c32..58bd56c591d04 100644 --- a/streaming/src/main/scala/org/apache/spark/streaming/scheduler/ExecutorAllocationManager.scala +++ b/streaming/src/main/scala/org/apache/spark/streaming/scheduler/ExecutorAllocationManager.scala @@ -23,6 +23,7 @@ import scala.util.Random import org.apache.spark.{ExecutorAllocationClient, SparkConf} import org.apache.spark.internal.Logging import org.apache.spark.internal.config.Streaming._ +import org.apache.spark.resource.ResourceProfile import org.apache.spark.streaming.util.RecurringTimer import org.apache.spark.util.{Clock, Utils} @@ -111,7 +112,11 @@ private[streaming] class ExecutorAllocationManager( logDebug(s"Executors (${allExecIds.size}) = ${allExecIds}") val targetTotalExecutors = math.max(math.min(maxNumExecutors, allExecIds.size + numNewExecutors), minNumExecutors) - client.requestTotalExecutors(targetTotalExecutors, 0, Map.empty) + // Just map the targetTotalExecutors to the default ResourceProfile + client.requestTotalExecutors( + Map(ResourceProfile.DEFAULT_RESOURCE_PROFILE_ID -> targetTotalExecutors), + Map(ResourceProfile.DEFAULT_RESOURCE_PROFILE_ID -> 0), + Map.empty) logInfo(s"Requested total $targetTotalExecutors executors") } diff --git a/streaming/src/test/scala/org/apache/spark/streaming/scheduler/ExecutorAllocationManagerSuite.scala b/streaming/src/test/scala/org/apache/spark/streaming/scheduler/ExecutorAllocationManagerSuite.scala index 9121da4b9b673..65efa10bfcf92 100644 --- a/streaming/src/test/scala/org/apache/spark/streaming/scheduler/ExecutorAllocationManagerSuite.scala +++ b/streaming/src/test/scala/org/apache/spark/streaming/scheduler/ExecutorAllocationManagerSuite.scala @@ -27,6 +27,7 @@ import org.scalatestplus.mockito.MockitoSugar import org.apache.spark.{ExecutorAllocationClient, SparkConf} import org.apache.spark.internal.config.{DYN_ALLOCATION_ENABLED, DYN_ALLOCATION_TESTING} import org.apache.spark.internal.config.Streaming._ +import org.apache.spark.resource.ResourceProfile import org.apache.spark.streaming.{DummyInputDStream, Seconds, StreamingContext, TestSuiteBase} import org.apache.spark.util.{ManualClock, Utils} @@ -71,10 +72,15 @@ class ExecutorAllocationManagerSuite extends TestSuiteBase if (expectedRequestedTotalExecs.nonEmpty) { 
require(expectedRequestedTotalExecs.get > 0) verify(allocationClient, times(1)).requestTotalExecutors( - meq(expectedRequestedTotalExecs.get), meq(0), meq(Map.empty)) + meq(Map(ResourceProfile.DEFAULT_RESOURCE_PROFILE_ID -> + expectedRequestedTotalExecs.get)), + meq(Map(ResourceProfile.DEFAULT_RESOURCE_PROFILE_ID -> 0)), + meq(Map.empty)) } else { - verify(allocationClient, never).requestTotalExecutors(0, 0, Map.empty) - } + verify(allocationClient, never).requestTotalExecutors( + Map(ResourceProfile.DEFAULT_RESOURCE_PROFILE_ID -> 0), + Map(ResourceProfile.DEFAULT_RESOURCE_PROFILE_ID -> 0), + Map.empty)} } /** Verify that a particular executor was killed */ @@ -139,8 +145,11 @@ class ExecutorAllocationManagerSuite extends TestSuiteBase reset(allocationClient) when(allocationClient.getExecutorIds()).thenReturn((1 to numExecs).map(_.toString)) requestExecutors(allocationManager, numNewExecs) - verify(allocationClient, times(1)).requestTotalExecutors( - meq(expectedRequestedTotalExecs), meq(0), meq(Map.empty)) + val defaultProfId = ResourceProfile.DEFAULT_RESOURCE_PROFILE_ID + verify(allocationClient, times(1)). + requestTotalExecutors( + meq(Map(defaultProfId -> expectedRequestedTotalExecs)), + meq(Map(defaultProfId -> 0)), meq(Map.empty)) } withAllocationManager(numReceivers = 1) { case (_, allocationManager) => From 926e3a1efe9e142804fcbf52146b22700640ae1b Mon Sep 17 00:00:00 2001 From: iRakson Date: Thu, 13 Feb 2020 12:23:40 +0800 Subject: [PATCH 030/185] [SPARK-30790] The dataType of map() should be map ### What changes were proposed in this pull request? `spark.sql("select map()")` returns {}. After these changes it will return map ### Why are the changes needed? After changes introduced due to #27521, it is important to maintain consistency while using map(). ### Does this PR introduce any user-facing change? Yes. Now map() will give map instead of {}. ### How was this patch tested? UT added. Migration guide updated as well Closes #27542 from iRakson/SPARK-30790. Authored-by: iRakson Signed-off-by: Wenchen Fan --- docs/sql-migration-guide.md | 2 +- .../expressions/complexTypeCreator.scala | 14 ++++++++--- .../catalyst/util/ArrayBasedMapBuilder.scala | 5 ++-- .../apache/spark/sql/internal/SQLConf.scala | 10 ++++---- .../spark/sql/DataFrameFunctionsSuite.scala | 25 +++++++++++++------ 5 files changed, 36 insertions(+), 20 deletions(-) diff --git a/docs/sql-migration-guide.md b/docs/sql-migration-guide.md index f98fab5b4c56b..46b741687363f 100644 --- a/docs/sql-migration-guide.md +++ b/docs/sql-migration-guide.md @@ -216,7 +216,7 @@ license: | - Since Spark 3.0, the `size` function returns `NULL` for the `NULL` input. In Spark version 2.4 and earlier, this function gives `-1` for the same input. To restore the behavior before Spark 3.0, you can set `spark.sql.legacy.sizeOfNull` to `true`. - - Since Spark 3.0, when the `array` function is called without any parameters, it returns an empty array of `NullType`. In Spark version 2.4 and earlier, it returns an empty array of string type. To restore the behavior before Spark 3.0, you can set `spark.sql.legacy.arrayDefaultToStringType.enabled` to `true`. + - Since Spark 3.0, when the `array`/`map` function is called without any parameters, it returns an empty collection with `NullType` as element type. In Spark version 2.4 and earlier, it returns an empty collection with `StringType` as element type. To restore the behavior before Spark 3.0, you can set `spark.sql.legacy.createEmptyCollectionUsingStringType` to `true`. 
- Since Spark 3.0, the interval literal syntax does not allow multiple from-to units anymore. For example, `SELECT INTERVAL '1-1' YEAR TO MONTH '2-2' YEAR TO MONTH'` throws parser exception. diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/complexTypeCreator.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/complexTypeCreator.scala index 7335e305bfe55..4bd85d304ded2 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/complexTypeCreator.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/complexTypeCreator.scala @@ -46,7 +46,7 @@ case class CreateArray(children: Seq[Expression]) extends Expression { } private val defaultElementType: DataType = { - if (SQLConf.get.getConf(SQLConf.LEGACY_ARRAY_DEFAULT_TO_STRING)) { + if (SQLConf.get.getConf(SQLConf.LEGACY_CREATE_EMPTY_COLLECTION_USING_STRING_TYPE)) { StringType } else { NullType @@ -145,6 +145,14 @@ case class CreateMap(children: Seq[Expression]) extends Expression { lazy val keys = children.indices.filter(_ % 2 == 0).map(children) lazy val values = children.indices.filter(_ % 2 != 0).map(children) + private val defaultElementType: DataType = { + if (SQLConf.get.getConf(SQLConf.LEGACY_CREATE_EMPTY_COLLECTION_USING_STRING_TYPE)) { + StringType + } else { + NullType + } + } + override def foldable: Boolean = children.forall(_.foldable) override def checkInputDataTypes(): TypeCheckResult = { @@ -167,9 +175,9 @@ case class CreateMap(children: Seq[Expression]) extends Expression { override lazy val dataType: MapType = { MapType( keyType = TypeCoercion.findCommonTypeDifferentOnlyInNullFlags(keys.map(_.dataType)) - .getOrElse(StringType), + .getOrElse(defaultElementType), valueType = TypeCoercion.findCommonTypeDifferentOnlyInNullFlags(values.map(_.dataType)) - .getOrElse(StringType), + .getOrElse(defaultElementType), valueContainsNull = values.exists(_.nullable)) } diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/ArrayBasedMapBuilder.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/ArrayBasedMapBuilder.scala index 98934368205ec..37d65309e2b89 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/ArrayBasedMapBuilder.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/ArrayBasedMapBuilder.scala @@ -29,12 +29,11 @@ import org.apache.spark.unsafe.array.ByteArrayMethods */ class ArrayBasedMapBuilder(keyType: DataType, valueType: DataType) extends Serializable { assert(!keyType.existsRecursively(_.isInstanceOf[MapType]), "key of map cannot be/contain map") - assert(keyType != NullType, "map key cannot be null type.") private lazy val keyToIndex = keyType match { // Binary type data is `byte[]`, which can't use `==` to check equality. - case _: AtomicType | _: CalendarIntervalType if !keyType.isInstanceOf[BinaryType] => - new java.util.HashMap[Any, Int]() + case _: AtomicType | _: CalendarIntervalType | _: NullType + if !keyType.isInstanceOf[BinaryType] => new java.util.HashMap[Any, Int]() case _ => // for complex types, use interpreted ordering to be able to compare unsafe data with safe // data, e.g. UnsafeRow vs GenericInternalRow. 
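As a quick, hypothetical illustration of the behavior change described in the commit message above (this sketch is not part of the patch; the config name comes from the SQLConf change below and the types follow the documented defaults):

```sql
-- Sketch only: with the Spark 3.0 default, an argument-less map() is typed with NullType
SELECT map();   -- dataType: map<null,null>; the value still prints as an empty map {}

-- Opting back into the 2.4 behavior via the new (internal) legacy flag
SET spark.sql.legacy.createEmptyCollectionUsingStringType=true;
SELECT map();   -- dataType: map<string,string>
```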
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala index d86f8693e0655..95b5b3afc3933 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala @@ -2007,12 +2007,12 @@ object SQLConf { .booleanConf .createWithDefault(false) - val LEGACY_ARRAY_DEFAULT_TO_STRING = - buildConf("spark.sql.legacy.arrayDefaultToStringType.enabled") + val LEGACY_CREATE_EMPTY_COLLECTION_USING_STRING_TYPE = + buildConf("spark.sql.legacy.createEmptyCollectionUsingStringType") .internal() - .doc("When set to true, it returns an empty array of string type when the `array` " + - "function is called without any parameters. Otherwise, it returns an empty " + - "array of `NullType`") + .doc("When set to true, Spark returns an empty collection with `StringType` as element " + + "type if the `array`/`map` function is called without any parameters. Otherwise, Spark " + + "returns an empty collection with `NullType` as element type.") .booleanConf .createWithDefault(false) diff --git a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameFunctionsSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameFunctionsSuite.scala index 6012678341ccc..f7531ea446015 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameFunctionsSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameFunctionsSuite.scala @@ -3499,13 +3499,6 @@ class DataFrameFunctionsSuite extends QueryTest with SharedSparkSession { ).foreach(assertValuesDoNotChangeAfterCoalesceOrUnion(_)) } - test("SPARK-21281 use string types by default if map have no argument") { - val ds = spark.range(1) - var expectedSchema = new StructType() - .add("x", MapType(StringType, StringType, valueContainsNull = false), nullable = false) - assert(ds.select(map().as("x")).schema == expectedSchema) - } - test("SPARK-21281 fails if functions have no argument") { val df = Seq(1).toDF("a") @@ -3563,7 +3556,8 @@ class DataFrameFunctionsSuite extends QueryTest with SharedSparkSession { test("SPARK-29462: Empty array of NullType for array function with no arguments") { Seq((true, StringType), (false, NullType)).foreach { case (arrayDefaultToString, expectedType) => - withSQLConf(SQLConf.LEGACY_ARRAY_DEFAULT_TO_STRING.key -> arrayDefaultToString.toString) { + withSQLConf(SQLConf.LEGACY_CREATE_EMPTY_COLLECTION_USING_STRING_TYPE.key -> + arrayDefaultToString.toString) { val schema = spark.range(1).select(array()).schema assert(schema.nonEmpty && schema.head.dataType.isInstanceOf[ArrayType]) val actualType = schema.head.dataType.asInstanceOf[ArrayType].elementType @@ -3571,6 +3565,21 @@ class DataFrameFunctionsSuite extends QueryTest with SharedSparkSession { } } } + + test("SPARK-30790: Empty map with NullType as key/value type for map function with no argument") { + Seq((true, StringType), (false, NullType)).foreach { + case (mapDefaultToString, expectedType) => + withSQLConf(SQLConf.LEGACY_CREATE_EMPTY_COLLECTION_USING_STRING_TYPE.key -> + mapDefaultToString.toString) { + val schema = spark.range(1).select(map()).schema + assert(schema.nonEmpty && schema.head.dataType.isInstanceOf[MapType]) + val actualKeyType = schema.head.dataType.asInstanceOf[MapType].keyType + val actualValueType = schema.head.dataType.asInstanceOf[MapType].valueType + assert(actualKeyType === expectedType) + assert(actualValueType === expectedType) + } + } + } } object 
DataFrameFunctionsSuite { From 453d5261b22ebcdd5886e65ab9d0d9857051e76a Mon Sep 17 00:00:00 2001 From: maryannxue Date: Thu, 13 Feb 2020 19:32:38 +0800 Subject: [PATCH 031/185] [SPARK-30528][SQL] Turn off DPP subquery duplication by default ### What changes were proposed in this pull request? This PR adds a config for Dynamic Partition Pruning subquery duplication and turns it off by default due to its potential performance regression. When planning a DPP filter, it seeks to reuse the broadcast exchange relation if the corresponding join is a BHJ with the filter relation being on the build side, otherwise it will either opt out or plan the filter as an un-reusable subquery duplication based on the cost estimate. However, the cost estimate is not accurate and only takes into account the table scan overhead, thus adding an un-reusable subquery duplication DPP filter can sometimes cause perf regression. This PR turns off the subquery duplication DPP filter by: 1. adding a config `spark.sql.optimizer.dynamicPartitionPruning.reuseBroadcastOnly` and setting it `true` by default. 2. removing the existing meaningless config `spark.sql.optimizer.dynamicPartitionPruning.reuseBroadcast` since we always want to reuse broadcast results if possible. ### Why are the changes needed? This is to fix a potential performance regression caused by DPP. ### Does this PR introduce any user-facing change? No. ### How was this patch tested? Updated DynamicPartitionPruningSuite to test the new configuration. Closes #27551 from maryannxue/spark-30528. Authored-by: maryannxue Signed-off-by: Wenchen Fan --- .../apache/spark/sql/internal/SQLConf.scala | 12 +- .../sql/dynamicpruning/PartitionPruning.scala | 4 +- .../PlanDynamicPruningFilters.scala | 5 +- .../sql/DynamicPartitionPruningSuite.scala | 183 +++++++----------- .../org/apache/spark/sql/ExplainSuite.scala | 3 +- 5 files changed, 82 insertions(+), 125 deletions(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala index 95b5b3afc3933..2214e03f34f0b 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala @@ -259,11 +259,11 @@ object SQLConf { .doubleConf .createWithDefault(0.5) - val DYNAMIC_PARTITION_PRUNING_REUSE_BROADCAST = - buildConf("spark.sql.optimizer.dynamicPartitionPruning.reuseBroadcast") + val DYNAMIC_PARTITION_PRUNING_REUSE_BROADCAST_ONLY = + buildConf("spark.sql.optimizer.dynamicPartitionPruning.reuseBroadcastOnly") .internal() - .doc("When true, dynamic partition pruning will seek to reuse the broadcast results from " + - "a broadcast hash join operation.") + .doc("When true, dynamic partition pruning will only apply when the broadcast exchange of " + + "a broadcast hash join operation can be reused as the dynamic pruning filter.") .booleanConf .createWithDefault(true) @@ -2303,8 +2303,8 @@ class SQLConf extends Serializable with Logging { def dynamicPartitionPruningFallbackFilterRatio: Double = getConf(DYNAMIC_PARTITION_PRUNING_FALLBACK_FILTER_RATIO) - def dynamicPartitionPruningReuseBroadcast: Boolean = - getConf(DYNAMIC_PARTITION_PRUNING_REUSE_BROADCAST) + def dynamicPartitionPruningReuseBroadcastOnly: Boolean = + getConf(DYNAMIC_PARTITION_PRUNING_REUSE_BROADCAST_ONLY) def stateStoreProviderClass: String = getConf(STATE_STORE_PROVIDER_CLASS) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/dynamicpruning/PartitionPruning.scala 
b/sql/core/src/main/scala/org/apache/spark/sql/dynamicpruning/PartitionPruning.scala index 48ba8618f272e..28f8f49d2ce44 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/dynamicpruning/PartitionPruning.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/dynamicpruning/PartitionPruning.scala @@ -86,7 +86,7 @@ object PartitionPruning extends Rule[LogicalPlan] with PredicateHelper { filteringPlan: LogicalPlan, joinKeys: Seq[Expression], hasBenefit: Boolean): LogicalPlan = { - val reuseEnabled = SQLConf.get.dynamicPartitionPruningReuseBroadcast + val reuseEnabled = SQLConf.get.exchangeReuseEnabled val index = joinKeys.indexOf(filteringKey) if (hasBenefit || reuseEnabled) { // insert a DynamicPruning wrapper to identify the subquery during query planning @@ -96,7 +96,7 @@ object PartitionPruning extends Rule[LogicalPlan] with PredicateHelper { filteringPlan, joinKeys, index, - !hasBenefit), + !hasBenefit || SQLConf.get.dynamicPartitionPruningReuseBroadcastOnly), pruningPlan) } else { // abort dynamic partition pruning diff --git a/sql/core/src/main/scala/org/apache/spark/sql/dynamicpruning/PlanDynamicPruningFilters.scala b/sql/core/src/main/scala/org/apache/spark/sql/dynamicpruning/PlanDynamicPruningFilters.scala index 1398dc049dd99..be00f728aa3ca 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/dynamicpruning/PlanDynamicPruningFilters.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/dynamicpruning/PlanDynamicPruningFilters.scala @@ -36,9 +36,6 @@ import org.apache.spark.sql.internal.SQLConf case class PlanDynamicPruningFilters(sparkSession: SparkSession) extends Rule[SparkPlan] with PredicateHelper { - private def reuseBroadcast: Boolean = - SQLConf.get.dynamicPartitionPruningReuseBroadcast && SQLConf.get.exchangeReuseEnabled - /** * Identify the shape in which keys of a given plan are broadcasted. */ @@ -59,7 +56,7 @@ case class PlanDynamicPruningFilters(sparkSession: SparkSession) sparkSession, sparkSession.sessionState.planner, buildPlan) // Using `sparkPlan` is a little hacky as it is based on the assumption that this rule is // the first to be applied (apart from `InsertAdaptiveSparkPlan`). 
- val canReuseExchange = reuseBroadcast && buildKeys.nonEmpty && + val canReuseExchange = SQLConf.get.exchangeReuseEnabled && buildKeys.nonEmpty && plan.find { case BroadcastHashJoinExec(_, _, _, BuildLeft, _, left, _) => left.sameResult(sparkPlan) diff --git a/sql/core/src/test/scala/org/apache/spark/sql/DynamicPartitionPruningSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/DynamicPartitionPruningSuite.scala index e1f9bcc4e008d..f7b51d6f4c8ef 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/DynamicPartitionPruningSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/DynamicPartitionPruningSuite.scala @@ -239,7 +239,8 @@ class DynamicPartitionPruningSuite */ test("simple inner join triggers DPP with mock-up tables") { withSQLConf(SQLConf.DYNAMIC_PARTITION_PRUNING_ENABLED.key -> "true", - SQLConf.DYNAMIC_PARTITION_PRUNING_REUSE_BROADCAST.key -> "false") { + SQLConf.DYNAMIC_PARTITION_PRUNING_REUSE_BROADCAST_ONLY.key -> "false", + SQLConf.EXCHANGE_REUSE_ENABLED.key -> "false") { withTable("df1", "df2") { spark.range(1000) .select(col("id"), col("id").as("k")) @@ -271,7 +272,8 @@ class DynamicPartitionPruningSuite */ test("self-join on a partitioned table should not trigger DPP") { withSQLConf(SQLConf.DYNAMIC_PARTITION_PRUNING_ENABLED.key -> "true", - SQLConf.DYNAMIC_PARTITION_PRUNING_REUSE_BROADCAST.key -> "false") { + SQLConf.DYNAMIC_PARTITION_PRUNING_REUSE_BROADCAST_ONLY.key -> "false", + SQLConf.EXCHANGE_REUSE_ENABLED.key -> "false") { withTable("fact") { sql( s""" @@ -302,7 +304,8 @@ class DynamicPartitionPruningSuite */ test("static scan metrics") { withSQLConf(SQLConf.DYNAMIC_PARTITION_PRUNING_ENABLED.key -> "true", - SQLConf.DYNAMIC_PARTITION_PRUNING_REUSE_BROADCAST.key -> "false") { + SQLConf.DYNAMIC_PARTITION_PRUNING_REUSE_BROADCAST_ONLY.key -> "false", + SQLConf.EXCHANGE_REUSE_ENABLED.key -> "false") { withTable("fact", "dim") { spark.range(10) .map { x => Tuple3(x, x + 1, 0) } @@ -370,7 +373,8 @@ class DynamicPartitionPruningSuite test("DPP should not be rewritten as an existential join") { withSQLConf(SQLConf.DYNAMIC_PARTITION_PRUNING_ENABLED.key -> "true", SQLConf.DYNAMIC_PARTITION_PRUNING_FALLBACK_FILTER_RATIO.key -> "1.5", - SQLConf.DYNAMIC_PARTITION_PRUNING_REUSE_BROADCAST.key -> "false") { + SQLConf.DYNAMIC_PARTITION_PRUNING_REUSE_BROADCAST_ONLY.key -> "false", + SQLConf.EXCHANGE_REUSE_ENABLED.key -> "false") { val df = sql( s""" |SELECT * FROM product p WHERE p.store_id NOT IN @@ -395,7 +399,7 @@ class DynamicPartitionPruningSuite */ test("DPP triggers only for certain types of query") { withSQLConf( - SQLConf.DYNAMIC_PARTITION_PRUNING_REUSE_BROADCAST.key -> "false") { + SQLConf.DYNAMIC_PARTITION_PRUNING_REUSE_BROADCAST_ONLY.key -> "false") { Given("dynamic partition pruning disabled") withSQLConf(SQLConf.DYNAMIC_PARTITION_PRUNING_ENABLED.key -> "false") { val df = sql( @@ -433,7 +437,8 @@ class DynamicPartitionPruningSuite } Given("left-semi join with partition column on the left side") - withSQLConf(SQLConf.DYNAMIC_PARTITION_PRUNING_ENABLED.key -> "true") { + withSQLConf(SQLConf.DYNAMIC_PARTITION_PRUNING_ENABLED.key -> "true", + SQLConf.EXCHANGE_REUSE_ENABLED.key -> "false") { val df = sql( """ |SELECT * FROM fact_sk f @@ -457,7 +462,8 @@ class DynamicPartitionPruningSuite } Given("right outer join with partition column on the left side") - withSQLConf(SQLConf.DYNAMIC_PARTITION_PRUNING_ENABLED.key -> "true") { + withSQLConf(SQLConf.DYNAMIC_PARTITION_PRUNING_ENABLED.key -> "true", + SQLConf.EXCHANGE_REUSE_ENABLED.key -> "false") { val df = sql( 
""" |SELECT * FROM fact_sk f RIGHT OUTER JOIN dim_store s @@ -474,7 +480,8 @@ class DynamicPartitionPruningSuite */ test("filtering ratio policy fallback") { withSQLConf( - SQLConf.DYNAMIC_PARTITION_PRUNING_REUSE_BROADCAST.key -> "false") { + SQLConf.DYNAMIC_PARTITION_PRUNING_REUSE_BROADCAST_ONLY.key -> "false", + SQLConf.EXCHANGE_REUSE_ENABLED.key -> "false") { Given("no stats and selective predicate") withSQLConf(SQLConf.DYNAMIC_PARTITION_PRUNING_ENABLED.key -> "true", SQLConf.DYNAMIC_PARTITION_PRUNING_USE_STATS.key -> "true") { @@ -543,7 +550,8 @@ class DynamicPartitionPruningSuite */ test("filtering ratio policy with stats when the broadcast pruning is disabled") { withSQLConf( - SQLConf.DYNAMIC_PARTITION_PRUNING_REUSE_BROADCAST.key -> "false") { + SQLConf.DYNAMIC_PARTITION_PRUNING_REUSE_BROADCAST_ONLY.key -> "false", + SQLConf.EXCHANGE_REUSE_ENABLED.key -> "false") { Given("disabling the use of stats in the DPP heuristic") withSQLConf(SQLConf.DYNAMIC_PARTITION_PRUNING_ENABLED.key -> "true", SQLConf.DYNAMIC_PARTITION_PRUNING_USE_STATS.key -> "false") { @@ -613,10 +621,7 @@ class DynamicPartitionPruningSuite test("partition pruning in broadcast hash joins with non-deterministic probe part") { Given("alias with simple join condition, and non-deterministic query") - withSQLConf( - SQLConf.DYNAMIC_PARTITION_PRUNING_REUSE_BROADCAST.key -> "true", - SQLConf.DYNAMIC_PARTITION_PRUNING_USE_STATS.key -> "false", - SQLConf.DYNAMIC_PARTITION_PRUNING_FALLBACK_FILTER_RATIO.key -> "0") { + withSQLConf(SQLConf.DYNAMIC_PARTITION_PRUNING_REUSE_BROADCAST_ONLY.key -> "true") { val df = sql( """ |SELECT f.date_id, f.pid, f.sid FROM @@ -630,10 +635,7 @@ class DynamicPartitionPruningSuite } Given("alias over multiple sub-queries with simple join condition") - withSQLConf( - SQLConf.DYNAMIC_PARTITION_PRUNING_REUSE_BROADCAST.key -> "true", - SQLConf.DYNAMIC_PARTITION_PRUNING_USE_STATS.key -> "false", - SQLConf.DYNAMIC_PARTITION_PRUNING_FALLBACK_FILTER_RATIO.key -> "0") { + withSQLConf(SQLConf.DYNAMIC_PARTITION_PRUNING_REUSE_BROADCAST_ONLY.key -> "true") { val df = sql( """ |SELECT f.date_id, f.pid, f.sid FROM @@ -651,10 +653,7 @@ class DynamicPartitionPruningSuite test("partition pruning in broadcast hash joins with aliases") { Given("alias with simple join condition, using attribute names only") - withSQLConf( - SQLConf.DYNAMIC_PARTITION_PRUNING_REUSE_BROADCAST.key -> "true", - SQLConf.DYNAMIC_PARTITION_PRUNING_USE_STATS.key -> "false", - SQLConf.DYNAMIC_PARTITION_PRUNING_FALLBACK_FILTER_RATIO.key -> "0") { + withSQLConf(SQLConf.DYNAMIC_PARTITION_PRUNING_REUSE_BROADCAST_ONLY.key -> "true") { val df = sql( """ |SELECT f.date_id, f.pid, f.sid FROM @@ -674,10 +673,7 @@ class DynamicPartitionPruningSuite } Given("alias with expr as join condition") - withSQLConf( - SQLConf.DYNAMIC_PARTITION_PRUNING_REUSE_BROADCAST.key -> "true", - SQLConf.DYNAMIC_PARTITION_PRUNING_USE_STATS.key -> "false", - SQLConf.DYNAMIC_PARTITION_PRUNING_FALLBACK_FILTER_RATIO.key -> "0") { + withSQLConf(SQLConf.DYNAMIC_PARTITION_PRUNING_REUSE_BROADCAST_ONLY.key -> "true") { val df = sql( """ |SELECT f.date_id, f.pid, f.sid FROM @@ -697,10 +693,7 @@ class DynamicPartitionPruningSuite } Given("alias over multiple sub-queries with simple join condition") - withSQLConf( - SQLConf.DYNAMIC_PARTITION_PRUNING_REUSE_BROADCAST.key -> "true", - SQLConf.DYNAMIC_PARTITION_PRUNING_USE_STATS.key -> "false", - SQLConf.DYNAMIC_PARTITION_PRUNING_FALLBACK_FILTER_RATIO.key -> "0") { + withSQLConf(SQLConf.DYNAMIC_PARTITION_PRUNING_REUSE_BROADCAST_ONLY.key -> 
"true") { val df = sql( """ |SELECT f.date_id, f.pid, f.sid FROM @@ -722,10 +715,7 @@ class DynamicPartitionPruningSuite } Given("alias over multiple sub-queries with simple join condition") - withSQLConf( - SQLConf.DYNAMIC_PARTITION_PRUNING_REUSE_BROADCAST.key -> "true", - SQLConf.DYNAMIC_PARTITION_PRUNING_USE_STATS.key -> "false", - SQLConf.DYNAMIC_PARTITION_PRUNING_FALLBACK_FILTER_RATIO.key -> "0") { + withSQLConf(SQLConf.DYNAMIC_PARTITION_PRUNING_REUSE_BROADCAST_ONLY.key -> "true") { val df = sql( """ |SELECT f.date_id, f.pid_d as pid, f.sid_d as sid FROM @@ -754,10 +744,8 @@ class DynamicPartitionPruningSuite test("partition pruning in broadcast hash joins") { Given("disable broadcast pruning and disable subquery duplication") withSQLConf( - SQLConf.DYNAMIC_PARTITION_PRUNING_REUSE_BROADCAST.key -> "false", - SQLConf.AUTO_BROADCASTJOIN_THRESHOLD.key -> "-1", - SQLConf.DYNAMIC_PARTITION_PRUNING_USE_STATS.key -> "false", - SQLConf.DYNAMIC_PARTITION_PRUNING_FALLBACK_FILTER_RATIO.key -> "0") { + SQLConf.DYNAMIC_PARTITION_PRUNING_REUSE_BROADCAST_ONLY.key -> "true", + SQLConf.EXCHANGE_REUSE_ENABLED.key -> "false") { val df = sql( """ |SELECT f.date_id, f.product_id, f.units_sold, f.store_id FROM fact_stats f @@ -777,9 +765,10 @@ class DynamicPartitionPruningSuite Given("disable reuse broadcast results and enable subquery duplication") withSQLConf( - SQLConf.DYNAMIC_PARTITION_PRUNING_REUSE_BROADCAST.key -> "false", + SQLConf.DYNAMIC_PARTITION_PRUNING_REUSE_BROADCAST_ONLY.key -> "false", SQLConf.DYNAMIC_PARTITION_PRUNING_USE_STATS.key -> "true", - SQLConf.DYNAMIC_PARTITION_PRUNING_FALLBACK_FILTER_RATIO.key -> "0.5") { + SQLConf.DYNAMIC_PARTITION_PRUNING_FALLBACK_FILTER_RATIO.key -> "0.5", + SQLConf.EXCHANGE_REUSE_ENABLED.key -> "false") { val df = sql( """ |SELECT f.date_id, f.product_id, f.units_sold, f.store_id FROM fact_stats f @@ -798,52 +787,47 @@ class DynamicPartitionPruningSuite } Given("enable reuse broadcast results and disable query duplication") - withSQLConf( - SQLConf.DYNAMIC_PARTITION_PRUNING_REUSE_BROADCAST.key -> "true", - SQLConf.DYNAMIC_PARTITION_PRUNING_USE_STATS.key -> "false", - SQLConf.DYNAMIC_PARTITION_PRUNING_FALLBACK_FILTER_RATIO.key -> "0") { - val df = sql( - """ - |SELECT f.date_id, f.product_id, f.units_sold, f.store_id FROM fact_stats f - |JOIN dim_stats s - |ON f.store_id = s.store_id WHERE s.country = 'DE' - """.stripMargin) + withSQLConf(SQLConf.DYNAMIC_PARTITION_PRUNING_REUSE_BROADCAST_ONLY.key -> "true") { + val df = sql( + """ + |SELECT f.date_id, f.product_id, f.units_sold, f.store_id FROM fact_stats f + |JOIN dim_stats s + |ON f.store_id = s.store_id WHERE s.country = 'DE' + """.stripMargin) - checkPartitionPruningPredicate(df, false, true) + checkPartitionPruningPredicate(df, false, true) - checkAnswer(df, - Row(1030, 2, 10, 3) :: - Row(1040, 2, 50, 3) :: - Row(1050, 2, 50, 3) :: - Row(1060, 2, 50, 3) :: Nil - ) + checkAnswer(df, + Row(1030, 2, 10, 3) :: + Row(1040, 2, 50, 3) :: + Row(1050, 2, 50, 3) :: + Row(1060, 2, 50, 3) :: Nil + ) } Given("disable broadcast hash join and disable query duplication") - withSQLConf( - SQLConf.DYNAMIC_PARTITION_PRUNING_REUSE_BROADCAST.key -> "true", - SQLConf.AUTO_BROADCASTJOIN_THRESHOLD.key -> "-1", - SQLConf.DYNAMIC_PARTITION_PRUNING_USE_STATS.key -> "false", - SQLConf.DYNAMIC_PARTITION_PRUNING_FALLBACK_FILTER_RATIO.key -> "0") { - val df = sql( - """ - |SELECT f.date_id, f.product_id, f.units_sold, f.store_id FROM fact_stats f - |JOIN dim_stats s - |ON f.store_id = s.store_id WHERE s.country = 'DE' - """.stripMargin) 
+ withSQLConf( + SQLConf.DYNAMIC_PARTITION_PRUNING_REUSE_BROADCAST_ONLY.key -> "true", + SQLConf.AUTO_BROADCASTJOIN_THRESHOLD.key -> "-1") { + val df = sql( + """ + |SELECT f.date_id, f.product_id, f.units_sold, f.store_id FROM fact_stats f + |JOIN dim_stats s + |ON f.store_id = s.store_id WHERE s.country = 'DE' + """.stripMargin) - checkPartitionPruningPredicate(df, false, false) + checkPartitionPruningPredicate(df, false, false) - checkAnswer(df, - Row(1030, 2, 10, 3) :: - Row(1040, 2, 50, 3) :: - Row(1050, 2, 50, 3) :: - Row(1060, 2, 50, 3) :: Nil - ) + checkAnswer(df, + Row(1030, 2, 10, 3) :: + Row(1040, 2, 50, 3) :: + Row(1050, 2, 50, 3) :: + Row(1060, 2, 50, 3) :: Nil + ) } Given("disable broadcast hash join and enable query duplication") - withSQLConf(SQLConf.DYNAMIC_PARTITION_PRUNING_REUSE_BROADCAST.key -> "true", + withSQLConf(SQLConf.DYNAMIC_PARTITION_PRUNING_REUSE_BROADCAST_ONLY.key -> "false", SQLConf.AUTO_BROADCASTJOIN_THRESHOLD.key -> "-1", SQLConf.DYNAMIC_PARTITION_PRUNING_USE_STATS.key -> "true") { val df = sql( @@ -865,9 +849,7 @@ class DynamicPartitionPruningSuite } test("broadcast a single key in a HashedRelation") { - withSQLConf(SQLConf.DYNAMIC_PARTITION_PRUNING_REUSE_BROADCAST.key -> "true", - SQLConf.DYNAMIC_PARTITION_PRUNING_USE_STATS.key -> "false", - SQLConf.DYNAMIC_PARTITION_PRUNING_FALLBACK_FILTER_RATIO.key -> "0") { + withSQLConf(SQLConf.DYNAMIC_PARTITION_PRUNING_REUSE_BROADCAST_ONLY.key -> "true") { withTable("fact", "dim") { spark.range(100).select( $"id", @@ -925,9 +907,7 @@ class DynamicPartitionPruningSuite } test("broadcast multiple keys in a LongHashedRelation") { - withSQLConf(SQLConf.DYNAMIC_PARTITION_PRUNING_REUSE_BROADCAST.key -> "true", - SQLConf.DYNAMIC_PARTITION_PRUNING_USE_STATS.key -> "false", - SQLConf.DYNAMIC_PARTITION_PRUNING_FALLBACK_FILTER_RATIO.key -> "0") { + withSQLConf(SQLConf.DYNAMIC_PARTITION_PRUNING_REUSE_BROADCAST_ONLY.key -> "true") { withTable("fact", "dim") { spark.range(100).select( $"id", @@ -962,9 +942,7 @@ class DynamicPartitionPruningSuite } test("broadcast multiple keys in an UnsafeHashedRelation") { - withSQLConf(SQLConf.DYNAMIC_PARTITION_PRUNING_REUSE_BROADCAST.key -> "true", - SQLConf.DYNAMIC_PARTITION_PRUNING_USE_STATS.key -> "false", - SQLConf.DYNAMIC_PARTITION_PRUNING_FALLBACK_FILTER_RATIO.key -> "0") { + withSQLConf(SQLConf.DYNAMIC_PARTITION_PRUNING_REUSE_BROADCAST_ONLY.key -> "true") { withTable("fact", "dim") { spark.range(100).select( $"id", @@ -999,9 +977,7 @@ class DynamicPartitionPruningSuite } test("different broadcast subqueries with identical children") { - withSQLConf(SQLConf.DYNAMIC_PARTITION_PRUNING_REUSE_BROADCAST.key -> "true", - SQLConf.DYNAMIC_PARTITION_PRUNING_USE_STATS.key -> "false", - SQLConf.DYNAMIC_PARTITION_PRUNING_FALLBACK_FILTER_RATIO.key -> "0") { + withSQLConf(SQLConf.DYNAMIC_PARTITION_PRUNING_REUSE_BROADCAST_ONLY.key -> "true") { withTable("fact", "dim") { spark.range(100).select( $"id", @@ -1073,7 +1049,7 @@ class DynamicPartitionPruningSuite } test("avoid reordering broadcast join keys to match input hash partitioning") { - withSQLConf(SQLConf.DYNAMIC_PARTITION_PRUNING_REUSE_BROADCAST.key -> "true", + withSQLConf(SQLConf.DYNAMIC_PARTITION_PRUNING_REUSE_BROADCAST_ONLY.key -> "false", SQLConf.AUTO_BROADCASTJOIN_THRESHOLD.key -> "-1") { withTable("large", "dimTwo", "dimThree") { spark.range(100).select( @@ -1123,9 +1099,7 @@ class DynamicPartitionPruningSuite * duplicated partitioning keys, also used to uniquely identify the dynamic pruning filters. 
*/ test("dynamic partition pruning ambiguity issue across nested joins") { - withSQLConf(SQLConf.DYNAMIC_PARTITION_PRUNING_REUSE_BROADCAST.key -> "true", - SQLConf.DYNAMIC_PARTITION_PRUNING_USE_STATS.key -> "false", - SQLConf.DYNAMIC_PARTITION_PRUNING_FALLBACK_FILTER_RATIO.key -> "0") { + withSQLConf(SQLConf.DYNAMIC_PARTITION_PRUNING_REUSE_BROADCAST_ONLY.key -> "true") { withTable("store", "date", "item") { spark.range(500) .select((($"id" + 30) % 50).as("ss_item_sk"), @@ -1163,9 +1137,7 @@ class DynamicPartitionPruningSuite } test("cleanup any DPP filter that isn't pushed down due to expression id clashes") { - withSQLConf(SQLConf.DYNAMIC_PARTITION_PRUNING_REUSE_BROADCAST.key -> "true", - SQLConf.DYNAMIC_PARTITION_PRUNING_USE_STATS.key -> "false", - SQLConf.DYNAMIC_PARTITION_PRUNING_FALLBACK_FILTER_RATIO.key -> "0") { + withSQLConf(SQLConf.DYNAMIC_PARTITION_PRUNING_REUSE_BROADCAST_ONLY.key -> "true") { withTable("fact", "dim") { spark.range(1000).select($"id".as("A"), $"id".as("AA")) .write.partitionBy("A").format(tableFormat).mode("overwrite").saveAsTable("fact") @@ -1186,10 +1158,7 @@ class DynamicPartitionPruningSuite } test("cleanup any DPP filter that isn't pushed down due to non-determinism") { - withSQLConf( - SQLConf.DYNAMIC_PARTITION_PRUNING_REUSE_BROADCAST.key -> "true", - SQLConf.DYNAMIC_PARTITION_PRUNING_USE_STATS.key -> "false", - SQLConf.DYNAMIC_PARTITION_PRUNING_FALLBACK_FILTER_RATIO.key -> "0") { + withSQLConf(SQLConf.DYNAMIC_PARTITION_PRUNING_REUSE_BROADCAST_ONLY.key -> "true") { val df = sql( """ |SELECT f.date_id, f.pid, f.sid FROM @@ -1204,9 +1173,7 @@ class DynamicPartitionPruningSuite } test("join key with multiple references on the filtering plan") { - withSQLConf(SQLConf.DYNAMIC_PARTITION_PRUNING_REUSE_BROADCAST.key -> "true", - SQLConf.DYNAMIC_PARTITION_PRUNING_USE_STATS.key -> "false", - SQLConf.DYNAMIC_PARTITION_PRUNING_FALLBACK_FILTER_RATIO.key -> "0", + withSQLConf(SQLConf.DYNAMIC_PARTITION_PRUNING_REUSE_BROADCAST_ONLY.key -> "true", SQLConf.ADAPTIVE_EXECUTION_ENABLED.key -> "false") { // when enable AQE, the reusedExchange is inserted when executed. 
withTable("fact", "dim") { @@ -1240,9 +1207,7 @@ class DynamicPartitionPruningSuite } test("Make sure dynamic pruning works on uncorrelated queries") { - withSQLConf(SQLConf.DYNAMIC_PARTITION_PRUNING_REUSE_BROADCAST.key -> "true", - SQLConf.DYNAMIC_PARTITION_PRUNING_USE_STATS.key -> "false", - SQLConf.DYNAMIC_PARTITION_PRUNING_FALLBACK_FILTER_RATIO.key -> "0") { + withSQLConf(SQLConf.DYNAMIC_PARTITION_PRUNING_REUSE_BROADCAST_ONLY.key -> "true") { val df = sql( """ |SELECT d.store_id, @@ -1266,10 +1231,7 @@ class DynamicPartitionPruningSuite test("Plan broadcast pruning only when the broadcast can be reused") { Given("dynamic pruning filter on the build side") - withSQLConf( - SQLConf.DYNAMIC_PARTITION_PRUNING_REUSE_BROADCAST.key -> "true", - SQLConf.DYNAMIC_PARTITION_PRUNING_USE_STATS.key -> "false", - SQLConf.DYNAMIC_PARTITION_PRUNING_FALLBACK_FILTER_RATIO.key -> "0") { + withSQLConf(SQLConf.DYNAMIC_PARTITION_PRUNING_REUSE_BROADCAST_ONLY.key -> "true") { val df = sql( """ |SELECT f.date_id, f.store_id, f.product_id, f.units_sold FROM fact_np f @@ -1288,10 +1250,7 @@ class DynamicPartitionPruningSuite } Given("dynamic pruning filter on the probe side") - withSQLConf( - SQLConf.DYNAMIC_PARTITION_PRUNING_REUSE_BROADCAST.key -> "true", - SQLConf.DYNAMIC_PARTITION_PRUNING_USE_STATS.key -> "false", - SQLConf.DYNAMIC_PARTITION_PRUNING_FALLBACK_FILTER_RATIO.key -> "0") { + withSQLConf(SQLConf.DYNAMIC_PARTITION_PRUNING_REUSE_BROADCAST_ONLY.key -> "true") { val df = sql( """ |SELECT /*+ BROADCAST(f)*/ diff --git a/sql/core/src/test/scala/org/apache/spark/sql/ExplainSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/ExplainSuite.scala index d9f4d6d5132ae..b591705274110 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/ExplainSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/ExplainSuite.scala @@ -239,7 +239,8 @@ class ExplainSuite extends QueryTest with SharedSparkSession { test("explain formatted - check presence of subquery in case of DPP") { withTable("df1", "df2") { withSQLConf(SQLConf.DYNAMIC_PARTITION_PRUNING_ENABLED.key -> "true", - SQLConf.DYNAMIC_PARTITION_PRUNING_REUSE_BROADCAST.key -> "false") { + SQLConf.DYNAMIC_PARTITION_PRUNING_REUSE_BROADCAST_ONLY.key -> "false", + SQLConf.EXCHANGE_REUSE_ENABLED.key -> "false") { withTable("df1", "df2") { spark.range(1000).select(col("id"), col("id").as("k")) .write From a6b4b914f2d2b873b0e9b9d446fda69dc74c3cf8 Mon Sep 17 00:00:00 2001 From: Terry Kim Date: Thu, 13 Feb 2020 20:13:36 +0800 Subject: [PATCH 032/185] [SPARK-30613][SQL] Support Hive style REPLACE COLUMNS syntax ### What changes were proposed in this pull request? This PR proposes to support Hive-style `ALTER TABLE ... REPLACE COLUMNS ...` as described in https://cwiki.apache.org/confluence/display/Hive/LanguageManual+DDL#LanguageManualDDL-Add/ReplaceColumns The user now can do the following: ```SQL CREATE TABLE t (col1 int, col2 int) USING Foo; ALTER TABLE t REPLACE COLUMNS (col2 string COMMENT 'comment2', col3 int COMMENT 'comment3'); ``` , which drops the existing columns `col1` and `col2`, and add new columns `col2` and `col3`. ### Why are the changes needed? This is a new DDL statement. Spark currently supports the Hive-style `ALTER TABLE ... CHANGE COLUMN ...`, so this new addition can be useful. ### Does this PR introduce any user-facing change? Yes, adding a new DDL statement. ### How was this patch tested? More tests to be added. Closes #27482 from imback82/replace_cols. 
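Alongside the happy path shown above, the new parser and analyzer checks reject a few Hive-style variants. A hedged sketch of statements the added tests expect to fail (the table name `t` is just a placeholder):

```sql
-- Sketch of the rejected forms, based on the error messages added in this patch
ALTER TABLE t REPLACE COLUMNS (col2 string NOT NULL);       -- NOT NULL is not supported in Hive-style REPLACE COLUMNS
ALTER TABLE t REPLACE COLUMNS (col2 string FIRST);          -- column position is not supported in Hive-style REPLACE COLUMNS
ALTER TABLE t PARTITION (a = '1') REPLACE COLUMNS (col2 string);  -- PARTITION spec is not allowed with REPLACE COLUMNS
-- For v1 (session catalog) tables, the analyzer fails with:
--   "REPLACE COLUMNS is only supported with v2 tables."
```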
Authored-by: Terry Kim Signed-off-by: Wenchen Fan --- .../spark/sql/catalyst/parser/SqlBase.g4 | 3 ++ .../sql/catalyst/analysis/CheckAnalysis.scala | 12 ++++- .../catalyst/analysis/ResolveCatalogs.scala | 21 ++++++++ .../sql/catalyst/parser/AstBuilder.scala | 21 ++++++++ .../catalyst/plans/logical/statements.scala | 4 ++ .../sql/catalyst/parser/DDLParserSuite.scala | 48 ++++++++++++++++++- .../analysis/ResolveSessionCatalog.scala | 23 +++++++++ .../spark/sql/connector/AlterTableTests.scala | 15 ++++++ 8 files changed, 145 insertions(+), 2 deletions(-) diff --git a/sql/catalyst/src/main/antlr4/org/apache/spark/sql/catalyst/parser/SqlBase.g4 b/sql/catalyst/src/main/antlr4/org/apache/spark/sql/catalyst/parser/SqlBase.g4 index 563ef69b3b8ae..2bc71476aba02 100644 --- a/sql/catalyst/src/main/antlr4/org/apache/spark/sql/catalyst/parser/SqlBase.g4 +++ b/sql/catalyst/src/main/antlr4/org/apache/spark/sql/catalyst/parser/SqlBase.g4 @@ -164,6 +164,9 @@ statement | ALTER TABLE table=multipartIdentifier partitionSpec? CHANGE COLUMN? colName=multipartIdentifier colType colPosition? #hiveChangeColumn + | ALTER TABLE table=multipartIdentifier partitionSpec? + REPLACE COLUMNS + '(' columns=qualifiedColTypeWithPositionList ')' #hiveReplaceColumns | ALTER TABLE multipartIdentifier (partitionSpec)? SET SERDE STRING (WITH SERDEPROPERTIES tablePropertyList)? #setTableSerDe | ALTER TABLE multipartIdentifier (partitionSpec)? diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/CheckAnalysis.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/CheckAnalysis.scala index e769e038c960f..67c509ed98245 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/CheckAnalysis.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/CheckAnalysis.scala @@ -473,9 +473,15 @@ trait CheckAnalysis extends PredicateHelper { } } + val colsToDelete = mutable.Set.empty[Seq[String]] + alter.changes.foreach { case add: AddColumn => - checkColumnNotExists("add", add.fieldNames(), table.schema) + // If a column to add is a part of columns to delete, we don't need to check + // if column already exists - applies to REPLACE COLUMNS scenario. + if (!colsToDelete.contains(add.fieldNames())) { + checkColumnNotExists("add", add.fieldNames(), table.schema) + } val parent = findParentStruct("add", add.fieldNames()) positionArgumentExists(add.position(), parent) TypeUtils.failWithIntervalType(add.dataType()) @@ -526,6 +532,10 @@ trait CheckAnalysis extends PredicateHelper { findField("update", update.fieldNames) case delete: DeleteColumn => findField("delete", delete.fieldNames) + // REPLACE COLUMNS has deletes followed by adds. Remember the deleted columns + // so that add operations do not fail when the columns to add exist and they + // are to be deleted. 
+ colsToDelete += delete.fieldNames case _ => // no validation needed for set and remove property } diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/ResolveCatalogs.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/ResolveCatalogs.scala index 88a3c0a73a10b..96558410d4004 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/ResolveCatalogs.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/ResolveCatalogs.scala @@ -44,6 +44,27 @@ class ResolveCatalogs(val catalogManager: CatalogManager) } createAlterTable(nameParts, catalog, tbl, changes) + case AlterTableReplaceColumnsStatement( + nameParts @ NonSessionCatalogAndTable(catalog, tbl), cols) => + val changes: Seq[TableChange] = loadTable(catalog, tbl.asIdentifier) match { + case Some(table) => + // REPLACE COLUMNS deletes all the existing columns and adds new columns specified. + val deleteChanges = table.schema.fieldNames.map { name => + TableChange.deleteColumn(Array(name)) + } + val addChanges = cols.map { col => + TableChange.addColumn( + col.name.toArray, + col.dataType, + col.nullable, + col.comment.orNull, + col.position.orNull) + } + deleteChanges ++ addChanges + case None => Seq() + } + createAlterTable(nameParts, catalog, tbl, changes) + case a @ AlterTableAlterColumnStatement( nameParts @ NonSessionCatalogAndTable(catalog, tbl), _, _, _, _, _) => val colName = a.column.toArray diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/parser/AstBuilder.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/parser/AstBuilder.scala index 62e568587fcc6..b3541a7f7374d 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/parser/AstBuilder.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/parser/AstBuilder.scala @@ -3030,6 +3030,27 @@ class AstBuilder(conf: SQLConf) extends SqlBaseBaseVisitor[AnyRef] with Logging position = Option(ctx.colPosition).map(typedVisit[ColumnPosition])) } + override def visitHiveReplaceColumns( + ctx: HiveReplaceColumnsContext): LogicalPlan = withOrigin(ctx) { + if (ctx.partitionSpec != null) { + operationNotAllowed("ALTER TABLE table PARTITION partition_spec REPLACE COLUMNS", ctx) + } + AlterTableReplaceColumnsStatement( + visitMultipartIdentifier(ctx.multipartIdentifier), + ctx.columns.qualifiedColTypeWithPosition.asScala.map { colType => + if (colType.NULL != null) { + throw new AnalysisException( + "NOT NULL is not supported in Hive-style REPLACE COLUMNS") + } + if (colType.colPosition != null) { + throw new AnalysisException( + "Column position is not supported in Hive-style REPLACE COLUMNS") + } + typedVisit[QualifiedColType](colType) + } + ) + } + /** * Parse a [[AlterTableDropColumnsStatement]] command. * diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/statements.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/statements.scala index 1e6b67bf78b70..6731214d3842d 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/statements.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/statements.scala @@ -156,6 +156,10 @@ case class AlterTableAddColumnsStatement( tableName: Seq[String], columnsToAdd: Seq[QualifiedColType]) extends ParsedStatement +case class AlterTableReplaceColumnsStatement( + tableName: Seq[String], + columnsToAdd: Seq[QualifiedColType]) extends ParsedStatement + /** * ALTER TABLE ... 
CHANGE COLUMN command, as parsed from SQL. */ diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/parser/DDLParserSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/parser/DDLParserSuite.scala index bc7b51f25b20d..049f56c8c9ce1 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/parser/DDLParserSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/parser/DDLParserSuite.scala @@ -699,7 +699,7 @@ class DDLParserSuite extends AnalysisTest { } } - test("alter table: hive style") { + test("alter table: hive style change column") { val sql1 = "ALTER TABLE table_name CHANGE COLUMN a.b.c c INT" val sql2 = "ALTER TABLE table_name CHANGE COLUMN a.b.c c INT COMMENT 'new_comment'" val sql3 = "ALTER TABLE table_name CHANGE COLUMN a.b.c c INT AFTER other_col" @@ -742,6 +742,52 @@ class DDLParserSuite extends AnalysisTest { intercept("ALTER TABLE table_name PARTITION (a='1') CHANGE COLUMN a.b.c c INT") } + test("alter table: hive style replace columns") { + val sql1 = "ALTER TABLE table_name REPLACE COLUMNS (x string)" + val sql2 = "ALTER TABLE table_name REPLACE COLUMNS (x string COMMENT 'x1')" + val sql3 = "ALTER TABLE table_name REPLACE COLUMNS (x string COMMENT 'x1', y int)" + val sql4 = "ALTER TABLE table_name REPLACE COLUMNS (x string COMMENT 'x1', y int COMMENT 'y1')" + + comparePlans( + parsePlan(sql1), + AlterTableReplaceColumnsStatement( + Seq("table_name"), + Seq(QualifiedColType(Seq("x"), StringType, true, None, None)))) + + comparePlans( + parsePlan(sql2), + AlterTableReplaceColumnsStatement( + Seq("table_name"), + Seq(QualifiedColType(Seq("x"), StringType, true, Some("x1"), None)))) + + comparePlans( + parsePlan(sql3), + AlterTableReplaceColumnsStatement( + Seq("table_name"), + Seq( + QualifiedColType(Seq("x"), StringType, true, Some("x1"), None), + QualifiedColType(Seq("y"), IntegerType, true, None, None) + ))) + + comparePlans( + parsePlan(sql4), + AlterTableReplaceColumnsStatement( + Seq("table_name"), + Seq( + QualifiedColType(Seq("x"), StringType, true, Some("x1"), None), + QualifiedColType(Seq("y"), IntegerType, true, Some("y1"), None) + ))) + + intercept("ALTER TABLE table_name PARTITION (a='1') REPLACE COLUMNS (x string)", + "Operation not allowed: ALTER TABLE table PARTITION partition_spec REPLACE COLUMNS") + + intercept("ALTER TABLE table_name REPLACE COLUMNS (x string NOT NULL)", + "NOT NULL is not supported in Hive-style REPLACE COLUMNS") + + intercept("ALTER TABLE table_name REPLACE COLUMNS (x string FIRST)", + "Column position is not supported in Hive-style REPLACE COLUMNS") + } + test("alter table/view: rename table/view") { comparePlans( parsePlan("ALTER TABLE a.b.c RENAME TO x.y.z"), diff --git a/sql/core/src/main/scala/org/apache/spark/sql/catalyst/analysis/ResolveSessionCatalog.scala b/sql/core/src/main/scala/org/apache/spark/sql/catalyst/analysis/ResolveSessionCatalog.scala index 77d549c28aae5..adeb2164eff63 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/catalyst/analysis/ResolveSessionCatalog.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/catalyst/analysis/ResolveSessionCatalog.scala @@ -69,6 +69,29 @@ class ResolveSessionCatalog( createAlterTable(nameParts, catalog, tbl, changes) } + case AlterTableReplaceColumnsStatement( + nameParts @ SessionCatalogAndTable(catalog, tbl), cols) => + val changes: Seq[TableChange] = loadTable(catalog, tbl.asIdentifier) match { + case Some(_: V1Table) => + throw new AnalysisException("REPLACE COLUMNS is only supported with v2 tables.") 
+ case Some(table) => + // REPLACE COLUMNS deletes all the existing columns and adds new columns specified. + val deleteChanges = table.schema.fieldNames.map { name => + TableChange.deleteColumn(Array(name)) + } + val addChanges = cols.map { col => + TableChange.addColumn( + col.name.toArray, + col.dataType, + col.nullable, + col.comment.orNull, + col.position.orNull) + } + deleteChanges ++ addChanges + case None => Seq() // Unresolved table will be handled in CheckAnalysis. + } + createAlterTable(nameParts, catalog, tbl, changes) + case a @ AlterTableAlterColumnStatement( nameParts @ SessionCatalogAndTable(catalog, tbl), _, _, _, _, _) => loadTable(catalog, tbl.asIdentifier).collect { diff --git a/sql/core/src/test/scala/org/apache/spark/sql/connector/AlterTableTests.scala b/sql/core/src/test/scala/org/apache/spark/sql/connector/AlterTableTests.scala index 420cb01d766a0..96fe301b512ea 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/connector/AlterTableTests.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/connector/AlterTableTests.scala @@ -1060,4 +1060,19 @@ trait AlterTableTests extends SharedSparkSession { assert(updated.properties === withDefaultOwnership(Map("provider" -> v2Format)).asJava) } } + + test("AlterTable: replace columns") { + val t = s"${catalogAndNamespace}table_name" + withTable(t) { + sql(s"CREATE TABLE $t (col1 int, col2 int COMMENT 'c2') USING $v2Format") + sql(s"ALTER TABLE $t REPLACE COLUMNS (col2 string, col3 int COMMENT 'c3')") + + val table = getTableMetadata(t) + + assert(table.name === fullTableName(t)) + assert(table.schema === StructType(Seq( + StructField("col2", StringType), + StructField("col3", IntegerType).withComment("c3")))) + } + } } From 04604b9899cc43a9726d671061ff305912fdb85f Mon Sep 17 00:00:00 2001 From: beliefer Date: Thu, 13 Feb 2020 22:06:24 +0800 Subject: [PATCH 033/185] [SPARK-30758][SQL][TESTS] Improve bracketed comments tests ### What changes were proposed in this pull request? Although Spark SQL support bracketed comments, but `SQLQueryTestSuite` can't treat bracketed comments well and lead to generated golden files can't display bracketed comments well. This PR will improve the treatment of bracketed comments and add three test case in `PlanParserSuite`. Spark SQL can't support nested bracketed comments and https://github.com/apache/spark/pull/27495 used to support it. ### Why are the changes needed? Golden files can't display well. ### Does this PR introduce any user-facing change? No ### How was this patch tested? New UT. Closes #27481 from beliefer/ansi-brancket-comments. 
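For context, a short sketch of the new `--QUERY-DELIMITER-START`/`--QUERY-DELIMITER-END` convention this patch adds to SQLQueryTestSuite input files (adapted from the comments.sql changes below, not an exact excerpt):

```sql
--QUERY-DELIMITER-START
-- everything between the two markers is submitted as a single query, so the semicolons
-- inside the bracketed comment no longer split it into broken fragments
/* This is an example of SQL which should not execute:
 * select 'multi-line';
 */
SELECT 'after multi-line' AS fifth;
--QUERY-DELIMITER-END
```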
Authored-by: beliefer Signed-off-by: Wenchen Fan --- .../sql-tests/inputs/postgreSQL/comments.sql | 6 +- .../results/postgreSQL/comments.sql.out | 137 ++++-------------- .../apache/spark/sql/SQLQueryTestSuite.scala | 51 ++++++- 3 files changed, 78 insertions(+), 116 deletions(-) diff --git a/sql/core/src/test/resources/sql-tests/inputs/postgreSQL/comments.sql b/sql/core/src/test/resources/sql-tests/inputs/postgreSQL/comments.sql index 6725ce45e72a5..1a454179ef79f 100644 --- a/sql/core/src/test/resources/sql-tests/inputs/postgreSQL/comments.sql +++ b/sql/core/src/test/resources/sql-tests/inputs/postgreSQL/comments.sql @@ -11,17 +11,19 @@ SELECT /* embedded single line */ 'embedded' AS `second`; SELECT /* both embedded and trailing single line */ 'both' AS third; -- trailing single line SELECT 'before multi-line' AS fourth; +--QUERY-DELIMITER-START -- [SPARK-28880] ANSI SQL: Bracketed comments /* This is an example of SQL which should not execute: * select 'multi-line'; */ SELECT 'after multi-line' AS fifth; +--QUERY-DELIMITER-END -- [SPARK-28880] ANSI SQL: Bracketed comments -- -- Nested comments -- - +--QUERY-DELIMITER-START /* SELECT 'trailing' as x1; -- inside block comment */ @@ -44,5 +46,5 @@ Hoo boy. Still two deep... Now just one deep... */ 'deeply nested example' AS sixth; - +--QUERY-DELIMITER-END /* and this is the end of the file */ diff --git a/sql/core/src/test/resources/sql-tests/results/postgreSQL/comments.sql.out b/sql/core/src/test/resources/sql-tests/results/postgreSQL/comments.sql.out index 4ea49013a62d1..637c5561bd940 100644 --- a/sql/core/src/test/resources/sql-tests/results/postgreSQL/comments.sql.out +++ b/sql/core/src/test/resources/sql-tests/results/postgreSQL/comments.sql.out @@ -1,5 +1,5 @@ -- Automatically generated by SQLQueryTestSuite --- Number of queries: 13 +-- Number of queries: 7 -- !query @@ -36,129 +36,32 @@ before multi-line -- !query /* This is an example of SQL which should not execute: - * select 'multi-line' --- !query schema -struct<> --- !query output -org.apache.spark.sql.catalyst.parser.ParseException - -mismatched input '/' expecting {'(', 'ADD', 'ALTER', 'ANALYZE', 'CACHE', 'CLEAR', 'COMMENT', 'COMMIT', 'CREATE', 'DELETE', 'DESC', 'DESCRIBE', 'DFS', 'DROP', 'EXPLAIN', 'EXPORT', 'FROM', 'GRANT', 'IMPORT', 'INSERT', 'LIST', 'LOAD', 'LOCK', 'MAP', 'MERGE', 'MSCK', 'REDUCE', 'REFRESH', 'REPLACE', 'RESET', 'REVOKE', 'ROLLBACK', 'SELECT', 'SET', 'SHOW', 'START', 'TABLE', 'TRUNCATE', 'UNCACHE', 'UNLOCK', 'UPDATE', 'USE', 'VALUES', 'WITH'}(line 1, pos 0) - -== SQL == -/* This is an example of SQL which should not execute: -^^^ - * select 'multi-line' - - --- !query -*/ + * select 'multi-line'; + */ SELECT 'after multi-line' AS fifth -- !query schema -struct<> +struct -- !query output -org.apache.spark.sql.catalyst.parser.ParseException - -extraneous input '*/' expecting {'(', 'ADD', 'ALTER', 'ANALYZE', 'CACHE', 'CLEAR', 'COMMENT', 'COMMIT', 'CREATE', 'DELETE', 'DESC', 'DESCRIBE', 'DFS', 'DROP', 'EXPLAIN', 'EXPORT', 'FROM', 'GRANT', 'IMPORT', 'INSERT', 'LIST', 'LOAD', 'LOCK', 'MAP', 'MERGE', 'MSCK', 'REDUCE', 'REFRESH', 'REPLACE', 'RESET', 'REVOKE', 'ROLLBACK', 'SELECT', 'SET', 'SHOW', 'START', 'TABLE', 'TRUNCATE', 'UNCACHE', 'UNLOCK', 'UPDATE', 'USE', 'VALUES', 'WITH'}(line 1, pos 0) - -== SQL == -*/ -^^^ -SELECT 'after multi-line' AS fifth +after multi-line -- !query /* -SELECT 'trailing' as x1 --- !query schema -struct<> --- !query output -org.apache.spark.sql.catalyst.parser.ParseException - -mismatched input '/' expecting {'(', 'ADD', 'ALTER', 
'ANALYZE', 'CACHE', 'CLEAR', 'COMMENT', 'COMMIT', 'CREATE', 'DELETE', 'DESC', 'DESCRIBE', 'DFS', 'DROP', 'EXPLAIN', 'EXPORT', 'FROM', 'GRANT', 'IMPORT', 'INSERT', 'LIST', 'LOAD', 'LOCK', 'MAP', 'MERGE', 'MSCK', 'REDUCE', 'REFRESH', 'REPLACE', 'RESET', 'REVOKE', 'ROLLBACK', 'SELECT', 'SET', 'SHOW', 'START', 'TABLE', 'TRUNCATE', 'UNCACHE', 'UNLOCK', 'UPDATE', 'USE', 'VALUES', 'WITH'}(line 1, pos 0) - -== SQL == -/* -^^^ -SELECT 'trailing' as x1 - - --- !query +SELECT 'trailing' as x1; -- inside block comment */ /* This block comment surrounds a query which itself has a block comment... -SELECT /* embedded single line */ 'embedded' AS x2 --- !query schema -struct<> --- !query output -org.apache.spark.sql.catalyst.parser.ParseException - -mismatched input '*/' expecting {'(', 'ADD', 'ALTER', 'ANALYZE', 'CACHE', 'CLEAR', 'COMMENT', 'COMMIT', 'CREATE', 'DELETE', 'DESC', 'DESCRIBE', 'DFS', 'DROP', 'EXPLAIN', 'EXPORT', 'FROM', 'GRANT', 'IMPORT', 'INSERT', 'LIST', 'LOAD', 'LOCK', 'MAP', 'MERGE', 'MSCK', 'REDUCE', 'REFRESH', 'REPLACE', 'RESET', 'REVOKE', 'ROLLBACK', 'SELECT', 'SET', 'SHOW', 'START', 'TABLE', 'TRUNCATE', 'UNCACHE', 'UNLOCK', 'UPDATE', 'USE', 'VALUES', 'WITH'}(line 1, pos 0) - -== SQL == -*/ -^^^ - -/* This block comment surrounds a query which itself has a block comment... -SELECT /* embedded single line */ 'embedded' AS x2 - - --- !query +SELECT /* embedded single line */ 'embedded' AS x2; */ SELECT -- continued after the following block comments... /* Deeply nested comment. This includes a single apostrophe to make sure we aren't decoding this part as a string. -SELECT 'deep nest' AS n1 --- !query schema -struct<> --- !query output -org.apache.spark.sql.catalyst.parser.ParseException - -extraneous input '*/' expecting {'(', 'ADD', 'ALTER', 'ANALYZE', 'CACHE', 'CLEAR', 'COMMENT', 'COMMIT', 'CREATE', 'DELETE', 'DESC', 'DESCRIBE', 'DFS', 'DROP', 'EXPLAIN', 'EXPORT', 'FROM', 'GRANT', 'IMPORT', 'INSERT', 'LIST', 'LOAD', 'LOCK', 'MAP', 'MERGE', 'MSCK', 'REDUCE', 'REFRESH', 'REPLACE', 'RESET', 'REVOKE', 'ROLLBACK', 'SELECT', 'SET', 'SHOW', 'START', 'TABLE', 'TRUNCATE', 'UNCACHE', 'UNLOCK', 'UPDATE', 'USE', 'VALUES', 'WITH'}(line 1, pos 0) - -== SQL == -*/ -^^^ - -SELECT -- continued after the following block comments... -/* Deeply nested comment. - This includes a single apostrophe to make sure we aren't decoding this part as a string. -SELECT 'deep nest' AS n1 - - --- !query +SELECT 'deep nest' AS n1; /* Second level of nesting... -SELECT 'deeper nest' as n2 --- !query schema -struct<> --- !query output -org.apache.spark.sql.catalyst.parser.ParseException - -mismatched input '/' expecting {'(', 'ADD', 'ALTER', 'ANALYZE', 'CACHE', 'CLEAR', 'COMMENT', 'COMMIT', 'CREATE', 'DELETE', 'DESC', 'DESCRIBE', 'DFS', 'DROP', 'EXPLAIN', 'EXPORT', 'FROM', 'GRANT', 'IMPORT', 'INSERT', 'LIST', 'LOAD', 'LOCK', 'MAP', 'MERGE', 'MSCK', 'REDUCE', 'REFRESH', 'REPLACE', 'RESET', 'REVOKE', 'ROLLBACK', 'SELECT', 'SET', 'SHOW', 'START', 'TABLE', 'TRUNCATE', 'UNCACHE', 'UNLOCK', 'UPDATE', 'USE', 'VALUES', 'WITH'}(line 1, pos 0) - -== SQL == -/* Second level of nesting... -^^^ -SELECT 'deeper nest' as n2 - - --- !query +SELECT 'deeper nest' as n2; /* Third level of nesting... 
-SELECT 'deepest nest' as n3 --- !query schema -struct<> --- !query output -org.apache.spark.sql.catalyst.parser.ParseException - -mismatched input '/' expecting {'(', 'ADD', 'ALTER', 'ANALYZE', 'CACHE', 'CLEAR', 'COMMENT', 'COMMIT', 'CREATE', 'DELETE', 'DESC', 'DESCRIBE', 'DFS', 'DROP', 'EXPLAIN', 'EXPORT', 'FROM', 'GRANT', 'IMPORT', 'INSERT', 'LIST', 'LOAD', 'LOCK', 'MAP', 'MERGE', 'MSCK', 'REDUCE', 'REFRESH', 'REPLACE', 'RESET', 'REVOKE', 'ROLLBACK', 'SELECT', 'SET', 'SHOW', 'START', 'TABLE', 'TRUNCATE', 'UNCACHE', 'UNLOCK', 'UPDATE', 'USE', 'VALUES', 'WITH'}(line 1, pos 0) - -== SQL == -/* Third level of nesting... -^^^ -SELECT 'deepest nest' as n3 - - --- !query +SELECT 'deepest nest' as n3; */ Hoo boy. Still two deep... */ @@ -170,11 +73,27 @@ struct<> -- !query output org.apache.spark.sql.catalyst.parser.ParseException -mismatched input '*/' expecting {'(', 'ADD', 'ALTER', 'ANALYZE', 'CACHE', 'CLEAR', 'COMMENT', 'COMMIT', 'CREATE', 'DELETE', 'DESC', 'DESCRIBE', 'DFS', 'DROP', 'EXPLAIN', 'EXPORT', 'FROM', 'GRANT', 'IMPORT', 'INSERT', 'LIST', 'LOAD', 'LOCK', 'MAP', 'MERGE', 'MSCK', 'REDUCE', 'REFRESH', 'REPLACE', 'RESET', 'REVOKE', 'ROLLBACK', 'SELECT', 'SET', 'SHOW', 'START', 'TABLE', 'TRUNCATE', 'UNCACHE', 'UNLOCK', 'UPDATE', 'USE', 'VALUES', 'WITH'}(line 1, pos 0) +mismatched input ''embedded'' expecting {'(', 'ADD', 'ALTER', 'ANALYZE', 'CACHE', 'CLEAR', 'COMMENT', 'COMMIT', 'CREATE', 'DELETE', 'DESC', 'DESCRIBE', 'DFS', 'DROP', 'EXPLAIN', 'EXPORT', 'FROM', 'GRANT', 'IMPORT', 'INSERT', 'LIST', 'LOAD', 'LOCK', 'MAP', 'MERGE', 'MSCK', 'REDUCE', 'REFRESH', 'REPLACE', 'RESET', 'REVOKE', 'ROLLBACK', 'SELECT', 'SET', 'SHOW', 'START', 'TABLE', 'TRUNCATE', 'UNCACHE', 'UNLOCK', 'UPDATE', 'USE', 'VALUES', 'WITH'}(line 6, pos 34) == SQL == +/* +SELECT 'trailing' as x1; -- inside block comment +*/ + +/* This block comment surrounds a query which itself has a block comment... +SELECT /* embedded single line */ 'embedded' AS x2; +----------------------------------^^^ +*/ + +SELECT -- continued after the following block comments... +/* Deeply nested comment. + This includes a single apostrophe to make sure we aren't decoding this part as a string. +SELECT 'deep nest' AS n1; +/* Second level of nesting... +SELECT 'deeper nest' as n2; +/* Third level of nesting... +SELECT 'deepest nest' as n3; */ -^^^ Hoo boy. Still two deep... */ Now just one deep... diff --git a/sql/core/src/test/scala/org/apache/spark/sql/SQLQueryTestSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/SQLQueryTestSuite.scala index 6b9e5bbd3c961..da4727f6a98cb 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/SQLQueryTestSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/SQLQueryTestSuite.scala @@ -19,7 +19,9 @@ package org.apache.spark.sql import java.io.File import java.util.{Locale, TimeZone} +import java.util.regex.Pattern +import scala.collection.mutable.{ArrayBuffer, HashMap} import scala.util.control.NonFatal import org.apache.spark.{SparkConf, SparkException} @@ -62,7 +64,12 @@ import org.apache.spark.tags.ExtendedSQLTest * }}} * * The format for input files is simple: - * 1. A list of SQL queries separated by semicolon. + * 1. A list of SQL queries separated by semicolons by default. If the semicolon cannot effectively + * separate the SQL queries in the test file(e.g. bracketed comments), please use + * --QUERY-DELIMITER-START and --QUERY-DELIMITER-END. Lines starting with + * --QUERY-DELIMITER-START and --QUERY-DELIMITER-END represent the beginning and end of a query, + * respectively. 
Code that is not surrounded by lines that begin with --QUERY-DELIMITER-START + * and --QUERY-DELIMITER-END is still separated by semicolons. * 2. Lines starting with -- are treated as comments and ignored. * 3. Lines starting with --SET are used to specify the configs when running this testing file. You * can set multiple configs in one --SET, using comma to separate them. Or you can use multiple @@ -246,9 +253,15 @@ class SQLQueryTestSuite extends QueryTest with SharedSparkSession { /** Run a test case. */ protected def runTest(testCase: TestCase): Unit = { + def splitWithSemicolon(seq: Seq[String]) = { + seq.mkString("\n").split("(?<=[^\\\\]);") + } val input = fileToString(new File(testCase.inputFile)) - val (comments, code) = input.split("\n").partition(_.trim.startsWith("--")) + val (comments, code) = input.split("\n").partition { line => + val newLine = line.trim + newLine.startsWith("--") && !newLine.startsWith("--QUERY-DELIMITER") + } // If `--IMPORT` found, load code from another test case file, then insert them // into the head in this test. @@ -261,10 +274,38 @@ class SQLQueryTestSuite extends QueryTest with SharedSparkSession { } }.flatten + val allCode = importedCode ++ code + val tempQueries = if (allCode.exists(_.trim.startsWith("--QUERY-DELIMITER"))) { + // Although the loop is heavy, only used for bracketed comments test. + val querys = new ArrayBuffer[String] + val otherCodes = new ArrayBuffer[String] + var tempStr = "" + var start = false + for (c <- allCode) { + if (c.trim.startsWith("--QUERY-DELIMITER-START")) { + start = true + querys ++= splitWithSemicolon(otherCodes.toSeq) + otherCodes.clear() + } else if (c.trim.startsWith("--QUERY-DELIMITER-END")) { + start = false + querys += s"\n${tempStr.stripSuffix(";")}" + tempStr = "" + } else if (start) { + tempStr += s"\n$c" + } else { + otherCodes += c + } + } + if (otherCodes.nonEmpty) { + querys ++= splitWithSemicolon(otherCodes.toSeq) + } + querys.toSeq + } else { + splitWithSemicolon(allCode).toSeq + } + // List of SQL queries to run - // note: this is not a robust way to split queries using semicolon, but works for now. - val queries = (importedCode ++ code).mkString("\n").split("(?<=[^\\\\]);") - .map(_.trim).filter(_ != "").toSeq + val queries = tempQueries.map(_.trim).filter(_ != "").toSeq // Fix misplacement when comment is at the end of the query. .map(_.split("\n").filterNot(_.startsWith("--")).mkString("\n")).map(_.trim).filter(_ != "") From fb0e07b08ccaeda50a5121bcb1fab69a1ff749c4 Mon Sep 17 00:00:00 2001 From: Yuming Wang Date: Thu, 13 Feb 2020 22:48:27 +0800 Subject: [PATCH 034/185] [SPARK-29231][SQL] Constraints should be inferred from cast equality constraint ### What changes were proposed in this pull request? This PR add support infer constraints from cast equality constraint. 
For example: ```scala scala> spark.sql("create table spark_29231_1(c1 bigint, c2 bigint)") res0: org.apache.spark.sql.DataFrame = [] scala> spark.sql("create table spark_29231_2(c1 int, c2 bigint)") res1: org.apache.spark.sql.DataFrame = [] scala> spark.sql("select t1.* from spark_29231_1 t1 join spark_29231_2 t2 on (t1.c1 = t2.c1 and t1.c1 = 1)").explain == Physical Plan == *(2) Project [c1#5L, c2#6L] +- *(2) BroadcastHashJoin [c1#5L], [cast(c1#7 as bigint)], Inner, BuildRight :- *(2) Project [c1#5L, c2#6L] : +- *(2) Filter (isnotnull(c1#5L) AND (c1#5L = 1)) : +- *(2) ColumnarToRow : +- FileScan parquet default.spark_29231_1[c1#5L,c2#6L] Batched: true, DataFilters: [isnotnull(c1#5L), (c1#5L = 1)], Format: Parquet, Location: InMemoryFileIndex[file:/root/spark-3.0.0-preview2-bin-hadoop2.7/spark-warehouse/spark_29231_1], PartitionFilters: [], PushedFilters: [IsNotNull(c1), EqualTo(c1,1)], ReadSchema: struct +- BroadcastExchange HashedRelationBroadcastMode(List(cast(input[0, int, true] as bigint))), [id=#209] +- *(1) Project [c1#7] +- *(1) Filter isnotnull(c1#7) +- *(1) ColumnarToRow +- FileScan parquet default.spark_29231_2[c1#7] Batched: true, DataFilters: [isnotnull(c1#7)], Format: Parquet, Location: InMemoryFileIndex[file:/root/spark-3.0.0-preview2-bin-hadoop2.7/spark-warehouse/spark_29231_2], PartitionFilters: [], PushedFilters: [IsNotNull(c1)], ReadSchema: struct ``` After this PR: ```scala scala> spark.sql("select t1.* from spark_29231_1 t1 join spark_29231_2 t2 on (t1.c1 = t2.c1 and t1.c1 = 1)").explain == Physical Plan == *(2) Project [c1#0L, c2#1L] +- *(2) BroadcastHashJoin [c1#0L], [cast(c1#2 as bigint)], Inner, BuildRight :- *(2) Project [c1#0L, c2#1L] : +- *(2) Filter (isnotnull(c1#0L) AND (c1#0L = 1)) : +- *(2) ColumnarToRow : +- FileScan parquet default.spark_29231_1[c1#0L,c2#1L] Batched: true, DataFilters: [isnotnull(c1#0L), (c1#0L = 1)], Format: Parquet, Location: InMemoryFileIndex[file:/root/opensource/spark/spark-warehouse/spark_29231_1], PartitionFilters: [], PushedFilters: [IsNotNull(c1), EqualTo(c1,1)], ReadSchema: struct +- BroadcastExchange HashedRelationBroadcastMode(List(cast(input[0, int, true] as bigint))), [id=#99] +- *(1) Project [c1#2] +- *(1) Filter ((cast(c1#2 as bigint) = 1) AND isnotnull(c1#2)) +- *(1) ColumnarToRow +- FileScan parquet default.spark_29231_2[c1#2] Batched: true, DataFilters: [(cast(c1#2 as bigint) = 1), isnotnull(c1#2)], Format: Parquet, Location: InMemoryFileIndex[file:/root/opensource/spark/spark-warehouse/spark_29231_2], PartitionFilters: [], PushedFilters: [IsNotNull(c1)], ReadSchema: struct ``` ### Why are the changes needed? Improve query performance. ### Does this PR introduce any user-facing change? No. ### How was this patch tested? Unit test. Closes #27252 from wangyum/SPARK-29231. 
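To make the inference concrete, here is a small self-contained Scala sketch of the substitution idea behind the rule. The `Expr`, `Attr`, `CastToLong` and `Lit` classes below are illustrative stand-ins, not Spark's Catalyst expressions: given the join condition `cast(t2.c1 as bigint) = t1.c1` and the filter `t1.c1 = 1`, replacing `t1.c1` with `cast(t2.c1 as bigint)` yields the extra predicate that can be pushed to the other side of the join.

```scala
// Toy model of constraint inference across a cast-equality; names are illustrative only.
sealed trait Expr
case class Attr(name: String) extends Expr
case class CastToLong(child: Expr) extends Expr
case class Lit(value: Long) extends Expr
case class EqualTo(left: Expr, right: Expr) extends Expr

// Rewrite every occurrence of `source` inside `pred` to `destination`.
def replace(pred: Expr, source: Expr, destination: Expr): Expr = pred match {
  case e if e == source => destination
  case EqualTo(l, r)    => EqualTo(replace(l, source, destination), replace(r, source, destination))
  case e                => e
}

val joinCond = EqualTo(CastToLong(Attr("t2.c1")), Attr("t1.c1")) // cast(t2.c1 as bigint) = t1.c1
val filter   = EqualTo(Attr("t1.c1"), Lit(1L))                   // t1.c1 = 1

// Substitute t1.c1 with cast(t2.c1 as bigint) in the remaining predicate.
val inferred = joinCond match {
  case EqualTo(l: CastToLong, r: Attr) => replace(filter, r, l)
  case other                           => other
}
println(inferred) // EqualTo(CastToLong(Attr(t2.c1)),Lit(1)), i.e. cast(t2.c1 as bigint) = 1
```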
Authored-by: Yuming Wang Signed-off-by: Wenchen Fan --- .../plans/logical/QueryPlanConstraints.scala | 12 +++- .../InferFiltersFromConstraintsSuite.scala | 57 ++++++++++++++++++- 2 files changed, 64 insertions(+), 5 deletions(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/QueryPlanConstraints.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/QueryPlanConstraints.scala index 1355003358b9f..4c4ec000d0930 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/QueryPlanConstraints.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/QueryPlanConstraints.scala @@ -62,11 +62,17 @@ trait ConstraintHelper { */ def inferAdditionalConstraints(constraints: Set[Expression]): Set[Expression] = { var inferredConstraints = Set.empty[Expression] - constraints.foreach { + // IsNotNull should be constructed by `constructIsNotNullConstraints`. + val predicates = constraints.filterNot(_.isInstanceOf[IsNotNull]) + predicates.foreach { case eq @ EqualTo(l: Attribute, r: Attribute) => - val candidateConstraints = constraints - eq + val candidateConstraints = predicates - eq inferredConstraints ++= replaceConstraints(candidateConstraints, l, r) inferredConstraints ++= replaceConstraints(candidateConstraints, r, l) + case eq @ EqualTo(l @ Cast(_: Attribute, _, _), r: Attribute) => + inferredConstraints ++= replaceConstraints(predicates - eq, r, l) + case eq @ EqualTo(l: Attribute, r @ Cast(_: Attribute, _, _)) => + inferredConstraints ++= replaceConstraints(predicates - eq, l, r) case _ => // No inference } inferredConstraints -- constraints @@ -75,7 +81,7 @@ trait ConstraintHelper { private def replaceConstraints( constraints: Set[Expression], source: Expression, - destination: Attribute): Set[Expression] = constraints.map(_ transform { + destination: Expression): Set[Expression] = constraints.map(_ transform { case e: Expression if e.semanticEquals(source) => destination }) diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/InferFiltersFromConstraintsSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/InferFiltersFromConstraintsSuite.scala index 974bc781d36ab..79bd573f1d84a 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/InferFiltersFromConstraintsSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/InferFiltersFromConstraintsSuite.scala @@ -24,6 +24,7 @@ import org.apache.spark.sql.catalyst.plans._ import org.apache.spark.sql.catalyst.plans.logical._ import org.apache.spark.sql.catalyst.rules._ import org.apache.spark.sql.internal.SQLConf +import org.apache.spark.sql.types.{IntegerType, LongType} class InferFiltersFromConstraintsSuite extends PlanTest { @@ -46,8 +47,8 @@ class InferFiltersFromConstraintsSuite extends PlanTest { y: LogicalPlan, expectedLeft: LogicalPlan, expectedRight: LogicalPlan, - joinType: JoinType) = { - val condition = Some("x.a".attr === "y.a".attr) + joinType: JoinType, + condition: Option[Expression] = Some("x.a".attr === "y.a".attr)) = { val originalQuery = x.join(y, joinType, condition).analyze val correctAnswer = expectedLeft.join(expectedRight, joinType, condition).analyze val optimized = Optimize.execute(originalQuery) @@ -263,4 +264,56 @@ class InferFiltersFromConstraintsSuite extends PlanTest { val y = testRelation.subquery('y) testConstraintsAfterJoin(x, y, x.where(IsNotNull('a)), y, RightOuter) } + + test("Constraints should 
be inferred from cast equality constraint(filter higher data type)") { + val testRelation1 = LocalRelation('a.int) + val testRelation2 = LocalRelation('b.long) + val originalLeft = testRelation1.subquery('left) + val originalRight = testRelation2.where('b === 1L).subquery('right) + + val left = testRelation1.where(IsNotNull('a) && 'a.cast(LongType) === 1L).subquery('left) + val right = testRelation2.where(IsNotNull('b) && 'b === 1L).subquery('right) + + Seq(Some("left.a".attr.cast(LongType) === "right.b".attr), + Some("right.b".attr === "left.a".attr.cast(LongType))).foreach { condition => + testConstraintsAfterJoin(originalLeft, originalRight, left, right, Inner, condition) + } + + Seq(Some("left.a".attr === "right.b".attr.cast(IntegerType)), + Some("right.b".attr.cast(IntegerType) === "left.a".attr)).foreach { condition => + testConstraintsAfterJoin( + originalLeft, + originalRight, + testRelation1.where(IsNotNull('a)).subquery('left), + right, + Inner, + condition) + } + } + + test("Constraints shouldn't be inferred from cast equality constraint(filter lower data type)") { + val testRelation1 = LocalRelation('a.int) + val testRelation2 = LocalRelation('b.long) + val originalLeft = testRelation1.where('a === 1).subquery('left) + val originalRight = testRelation2.subquery('right) + + val left = testRelation1.where(IsNotNull('a) && 'a === 1).subquery('left) + val right = testRelation2.where(IsNotNull('b)).subquery('right) + + Seq(Some("left.a".attr.cast(LongType) === "right.b".attr), + Some("right.b".attr === "left.a".attr.cast(LongType))).foreach { condition => + testConstraintsAfterJoin(originalLeft, originalRight, left, right, Inner, condition) + } + + Seq(Some("left.a".attr === "right.b".attr.cast(IntegerType)), + Some("right.b".attr.cast(IntegerType) === "left.a".attr)).foreach { condition => + testConstraintsAfterJoin( + originalLeft, + originalRight, + left, + testRelation2.where(IsNotNull('b) && 'b.attr.cast(IntegerType) === 1).subquery('right), + Inner, + condition) + } + } } From 82d0aa37ae521231d8067e473c6ea79a118a115a Mon Sep 17 00:00:00 2001 From: Liang Zhang Date: Thu, 13 Feb 2020 23:55:13 +0800 Subject: [PATCH 035/185] [SPARK-30762] Add dtype=float32 support to vector_to_array UDF ### What changes were proposed in this pull request? In this PR, we add a parameter in the python function vector_to_array(col) that allows converting to a column of arrays of Float (32bits) in scala, which would be mapped to a numpy array of dtype=float32. ### Why are the changes needed? In the downstream ML training, using float32 instead of float64 (default) would allow a larger batch size, i.e., allow more data to fit in the memory. ### Does this PR introduce any user-facing change? Yes. Old: `vector_to_array()` only take one param ``` df.select(vector_to_array("colA"), ...) ``` New: `vector_to_array()` can take an additional optional param: `dtype` = "float32" (or "float64") ``` df.select(vector_to_array("colA", "float32"), ...) ``` ### How was this patch tested? Unit test in scala. doctest in python. Closes #27522 from liangz1/udf-float32. 
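As a quick usage sketch of the new `dtype` parameter (assuming a spark-shell session, i.e. an existing `SparkSession` named `spark`, with this patch applied):

```scala
import org.apache.spark.ml.functions.vector_to_array
import org.apache.spark.ml.linalg.Vectors
import spark.implicits._

val df = Seq(
  Tuple1(Vectors.dense(1.0, 2.0, 3.0)),
  Tuple1(Vectors.sparse(3, Seq((0, 2.0), (2, 3.0))))
).toDF("vec")

// The default "float64" keeps array<double>; "float32" produces array<float>,
// roughly halving the memory footprint of the materialized arrays.
val converted = df.select(
  vector_to_array($"vec").alias("f64"),
  vector_to_array($"vec", "float32").alias("f32"))

converted.printSchema()
// f64: array<double>
// f32: array<float>
```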
Authored-by: Liang Zhang Signed-off-by: WeichenXu --- .../scala/org/apache/spark/ml/functions.scala | 34 ++++++++++++++++--- .../org/apache/spark/ml/FunctionsSuite.scala | 33 +++++++++++++++--- python/pyspark/ml/functions.py | 27 ++++++++++++--- 3 files changed, 81 insertions(+), 13 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/ml/functions.scala b/mllib/src/main/scala/org/apache/spark/ml/functions.scala index 1faf562c4d896..0f03231079866 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/functions.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/functions.scala @@ -18,7 +18,7 @@ package org.apache.spark.ml import org.apache.spark.annotation.Since -import org.apache.spark.ml.linalg.Vector +import org.apache.spark.ml.linalg.{SparseVector, Vector} import org.apache.spark.mllib.linalg.{Vector => OldVector} import org.apache.spark.sql.Column import org.apache.spark.sql.functions.udf @@ -27,7 +27,6 @@ import org.apache.spark.sql.functions.udf @Since("3.0.0") object functions { // scalastyle:on - private val vectorToArrayUdf = udf { vec: Any => vec match { case v: Vector => v.toArray @@ -39,10 +38,37 @@ object functions { } }.asNonNullable() + private val vectorToArrayFloatUdf = udf { vec: Any => + vec match { + case v: SparseVector => + val data = new Array[Float](v.size) + v.foreachActive { (index, value) => data(index) = value.toFloat } + data + case v: Vector => v.toArray.map(_.toFloat) + case v: OldVector => v.toArray.map(_.toFloat) + case v => throw new IllegalArgumentException( + "function vector_to_array requires a non-null input argument and input type must be " + + "`org.apache.spark.ml.linalg.Vector` or `org.apache.spark.mllib.linalg.Vector`, " + + s"but got ${ if (v == null) "null" else v.getClass.getName }.") + } + }.asNonNullable() + /** * Converts a column of MLlib sparse/dense vectors into a column of dense arrays. - * + * @param v: the column of MLlib sparse/dense vectors + * @param dtype: the desired underlying data type in the returned array + * @return an array<float> if dtype is float32, or array<double> if dtype is float64 * @since 3.0.0 */ - def vector_to_array(v: Column): Column = vectorToArrayUdf(v) + def vector_to_array(v: Column, dtype: String = "float64"): Column = { + if (dtype == "float64") { + vectorToArrayUdf(v) + } else if (dtype == "float32") { + vectorToArrayFloatUdf(v) + } else { + throw new IllegalArgumentException( + s"Unsupported dtype: $dtype. Valid values: float64, float32." 
+ ) + } + } } diff --git a/mllib/src/test/scala/org/apache/spark/ml/FunctionsSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/FunctionsSuite.scala index 2f5062c689fc7..3dd9a7d8ec85d 100644 --- a/mllib/src/test/scala/org/apache/spark/ml/FunctionsSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/ml/FunctionsSuite.scala @@ -34,9 +34,8 @@ class FunctionsSuite extends MLTest { (Vectors.sparse(3, Seq((0, 2.0), (2, 3.0))), OldVectors.sparse(3, Seq((0, 20.0), (2, 30.0)))) ).toDF("vec", "oldVec") - val result = df.select(vector_to_array('vec), vector_to_array('oldVec)) - .as[(Seq[Double], Seq[Double])] - .collect().toSeq + val result = df.select(vector_to_array('vec), vector_to_array('oldVec)) + .as[(Seq[Double], Seq[Double])].collect().toSeq val expected = Seq( (Seq(1.0, 2.0, 3.0), Seq(10.0, 20.0, 30.0)), @@ -50,7 +49,6 @@ class FunctionsSuite extends MLTest { (null, null, 0) ).toDF("vec", "oldVec", "label") - for ((colName, valType) <- Seq( ("vec", "null"), ("oldVec", "null"), ("label", "java.lang.Integer"))) { val thrown1 = intercept[SparkException] { @@ -61,5 +59,32 @@ class FunctionsSuite extends MLTest { "`org.apache.spark.ml.linalg.Vector` or `org.apache.spark.mllib.linalg.Vector`, " + s"but got ${valType}")) } + + val df3 = Seq( + (Vectors.dense(1.0, 2.0, 3.0), OldVectors.dense(10.0, 20.0, 30.0)), + (Vectors.sparse(3, Seq((0, 2.0), (2, 3.0))), OldVectors.sparse(3, Seq((0, 20.0), (2, 30.0)))) + ).toDF("vec", "oldVec") + val dfArrayFloat = df3.select( + vector_to_array('vec, dtype = "float32"), vector_to_array('oldVec, dtype = "float32")) + + // Check values are correct + val result3 = dfArrayFloat.as[(Seq[Float], Seq[Float])].collect().toSeq + + val expected3 = Seq( + (Seq(1.0, 2.0, 3.0), Seq(10.0, 20.0, 30.0)), + (Seq(2.0, 0.0, 3.0), Seq(20.0, 0.0, 30.0)) + ) + assert(result3 === expected3) + + // Check data types are correct + assert(dfArrayFloat.schema.simpleString === + "struct,UDF(oldVec):array>") + + val thrown2 = intercept[IllegalArgumentException] { + df3.select( + vector_to_array('vec, dtype = "float16"), vector_to_array('oldVec, dtype = "float16")) + } + assert(thrown2.getMessage.contains( + s"Unsupported dtype: float16. Valid values: float64, float32.")) } } diff --git a/python/pyspark/ml/functions.py b/python/pyspark/ml/functions.py index 2b4d8ddcd00a8..ec164f34bc4db 100644 --- a/python/pyspark/ml/functions.py +++ b/python/pyspark/ml/functions.py @@ -19,10 +19,15 @@ from pyspark.sql.column import Column, _to_java_column -@since(3.0) -def vector_to_array(col): +@since("3.0.0") +def vector_to_array(col, dtype="float64"): """ Converts a column of MLlib sparse/dense vectors into a column of dense arrays. + :param col: A string of the column name or a Column + :param dtype: The data type of the output array. Valid values: "float64" or "float32". + :return: The converted column of dense arrays. + + .. versionadded:: 3.0.0 >>> from pyspark.ml.linalg import Vectors >>> from pyspark.ml.functions import vector_to_array @@ -32,14 +37,26 @@ def vector_to_array(col): ... (Vectors.sparse(3, [(0, 2.0), (2, 3.0)]), ... OldVectors.sparse(3, [(0, 20.0), (2, 30.0)]))], ... ["vec", "oldVec"]) - >>> df.select(vector_to_array("vec").alias("vec"), - ... vector_to_array("oldVec").alias("oldVec")).collect() + >>> df1 = df.select(vector_to_array("vec").alias("vec"), + ... 
vector_to_array("oldVec").alias("oldVec")) + >>> df1.collect() + [Row(vec=[1.0, 2.0, 3.0], oldVec=[10.0, 20.0, 30.0]), + Row(vec=[2.0, 0.0, 3.0], oldVec=[20.0, 0.0, 30.0])] + >>> df2 = df.select(vector_to_array("vec", "float32").alias("vec"), + ... vector_to_array("oldVec", "float32").alias("oldVec")) + >>> df2.collect() [Row(vec=[1.0, 2.0, 3.0], oldVec=[10.0, 20.0, 30.0]), Row(vec=[2.0, 0.0, 3.0], oldVec=[20.0, 0.0, 30.0])] + >>> df1.schema.fields + [StructField(vec,ArrayType(DoubleType,false),false), + StructField(oldVec,ArrayType(DoubleType,false),false)] + >>> df2.schema.fields + [StructField(vec,ArrayType(FloatType,false),false), + StructField(oldVec,ArrayType(FloatType,false),false)] """ sc = SparkContext._active_spark_context return Column( - sc._jvm.org.apache.spark.ml.functions.vector_to_array(_to_java_column(col))) + sc._jvm.org.apache.spark.ml.functions.vector_to_array(_to_java_column(col), dtype)) def _test(): From 3c4044ea77fe3b1268b52744cd4f1ae61f17a9a8 Mon Sep 17 00:00:00 2001 From: Takeshi Yamamuro Date: Thu, 13 Feb 2020 10:53:55 -0800 Subject: [PATCH 036/185] [SPARK-30703][SQL][DOCS] Add a document for the ANSI mode ### What changes were proposed in this pull request? This pr intends to add a document for the ANSI mode; Screen Shot 2020-02-13 at 8 08 52 Screen Shot 2020-02-13 at 8 09 13 Screen Shot 2020-02-13 at 8 09 26 Screen Shot 2020-02-13 at 8 09 38 ### Why are the changes needed? For better document coverage and usability. ### Does this PR introduce any user-facing change? No. ### How was this patch tested? N/A Closes #27489 from maropu/SPARK-30703. Authored-by: Takeshi Yamamuro Signed-off-by: Gengliang Wang --- docs/_data/menu-sql.yaml | 11 +- ...keywords.md => sql-ref-ansi-compliance.md} | 125 +++++++++++++++++- docs/sql-ref-arithmetic-ops.md | 22 --- 3 files changed, 132 insertions(+), 26 deletions(-) rename docs/{sql-keywords.md => sql-ref-ansi-compliance.md} (82%) delete mode 100644 docs/sql-ref-arithmetic-ops.md diff --git a/docs/_data/menu-sql.yaml b/docs/_data/menu-sql.yaml index 241ec399d7bd5..1e343f630f88e 100644 --- a/docs/_data/menu-sql.yaml +++ b/docs/_data/menu-sql.yaml @@ -80,6 +80,15 @@ url: sql-ref-null-semantics.html - text: NaN Semantics url: sql-ref-nan-semantics.html + - text: ANSI Compliance + url: sql-ref-ansi-compliance.html + subitems: + - text: Arithmetic Operations + url: sql-ref-ansi-compliance.html#arithmetic-operations + - text: Type Conversion + url: sql-ref-ansi-compliance.html#type-conversion + - text: SQL Keywords + url: sql-ref-ansi-compliance.html#sql-keywords - text: SQL Syntax url: sql-ref-syntax.html subitems: @@ -214,5 +223,3 @@ url: sql-ref-syntax-aux-resource-mgmt-list-file.html - text: LIST JAR url: sql-ref-syntax-aux-resource-mgmt-list-jar.html - - text: Arithmetic operations - url: sql-ref-arithmetic-ops.html diff --git a/docs/sql-keywords.md b/docs/sql-ref-ansi-compliance.md similarity index 82% rename from docs/sql-keywords.md rename to docs/sql-ref-ansi-compliance.md index 9e4a3c54100c6..d02383518b967 100644 --- a/docs/sql-keywords.md +++ b/docs/sql-ref-ansi-compliance.md @@ -1,7 +1,7 @@ --- layout: global -title: Spark SQL Keywords -displayTitle: Spark SQL Keywords +title: ANSI Compliance +displayTitle: ANSI Compliance license: | Licensed to the Apache Software Foundation (ASF) under one or more contributor license agreements. See the NOTICE file distributed with @@ -19,6 +19,127 @@ license: | limitations under the License. 
--- +Spark SQL has two options to comply with the SQL standard: `spark.sql.ansi.enabled` and `spark.sql.storeAssignmentPolicy` (See a table below for details). +When `spark.sql.ansi.enabled` is set to `true`, Spark SQL follows the standard in basic behaviours (e.g., arithmetic operations, type conversion, and SQL parsing). +Moreover, Spark SQL has an independent option to control implicit casting behaviours when inserting rows in a table. +The casting behaviours are defined as store assignment rules in the standard. +When `spark.sql.storeAssignmentPolicy` is set to `ANSI`, Spark SQL complies with the ANSI store assignment rules. + + + + + + + + + + + + + +
+<tr><th>Property Name</th><th>Default</th><th>Meaning</th></tr>
+<tr>
+  <td><code>spark.sql.ansi.enabled</code></td>
+  <td>false</td>
+  <td>
+    When true, Spark tries to conform to the ANSI SQL specification:
+    1. Spark will throw a runtime exception if an overflow occurs in any operation on integral/decimal field.
+    2. Spark will forbid using the reserved keywords of ANSI SQL as identifiers in the SQL parser.
+  </td>
+</tr>
+<tr>
+  <td><code>spark.sql.storeAssignmentPolicy</code></td>
+  <td>ANSI</td>
+  <td>
+    When inserting a value into a column with different data type, Spark will perform type coercion.
+    Currently, we support 3 policies for the type coercion rules: ANSI, legacy and strict. With ANSI policy,
+    Spark performs the type coercion as per ANSI SQL. In practice, the behavior is mostly the same as PostgreSQL.
+    It disallows certain unreasonable type conversions such as converting string to int or double to boolean.
+    With legacy policy, Spark allows the type coercion as long as it is a valid Cast, which is very loose.
+    e.g. converting string to int or double to boolean is allowed.
+    It is also the only behavior in Spark 2.x and it is compatible with Hive.
+    With strict policy, Spark doesn't allow any possible precision loss or data truncation in type coercion,
+    e.g. converting double to int or decimal to double is not allowed.
+  </td>
+</tr>
+ +The following subsections present behaviour changes in arithmetic operations, type conversions, and SQL parsing when the ANSI mode enabled. + +### Arithmetic Operations + +In Spark SQL, arithmetic operations performed on numeric types (with the exception of decimal) are not checked for overflows by default. +This means that in case an operation causes overflows, the result is the same that the same operation returns in a Java/Scala program (e.g., if the sum of 2 integers is higher than the maximum value representable, the result is a negative number). +On the other hand, Spark SQL returns null for decimal overflows. +When `spark.sql.ansi.enabled` is set to `true` and an overflow occurs in numeric and interval arithmetic operations, it throws an arithmetic exception at runtime. + +{% highlight sql %} +-- `spark.sql.ansi.enabled=true` +SELECT 2147483647 + 1; + + java.lang.ArithmeticException: integer overflow + +-- `spark.sql.ansi.enabled=false` +SELECT 2147483647 + 1; + + +----------------+ + |(2147483647 + 1)| + +----------------+ + | -2147483648| + +----------------+ + +{% endhighlight %} + +### Type Conversion + +Spark SQL has three kinds of type conversions: explicit casting, type coercion, and store assignment casting. +When `spark.sql.ansi.enabled` is set to `true`, explicit casting by `CAST` syntax throws a runtime exception for illegal cast patterns defined in the standard, e.g. casts from a string to an integer. +On the other hand, `INSERT INTO` syntax throws an analysis exception when the ANSI mode enabled via `spark.sql.storeAssignmentPolicy=ANSI`. + +Currently, the ANSI mode affects explicit casting and assignment casting only. +In future releases, the behaviour of type coercion might change along with the other two type conversion rules. + +{% highlight sql %} +-- Examples of explicit casting + +-- `spark.sql.ansi.enabled=true` +SELECT CAST('a' AS INT); + + java.lang.NumberFormatException: invalid input syntax for type numeric: a + +SELECT CAST(2147483648L AS INT); + + java.lang.ArithmeticException: Casting 2147483648 to int causes overflow + +-- `spark.sql.ansi.enabled=false` (This is a default behaviour) +SELECT CAST('a' AS INT); + + +--------------+ + |CAST(a AS INT)| + +--------------+ + | null| + +--------------+ + +SELECT CAST(2147483648L AS INT); + + +-----------------------+ + |CAST(2147483648 AS INT)| + +-----------------------+ + | -2147483648| + +-----------------------+ + +-- Examples of store assignment rules +CREATE TABLE t (v INT); + +-- `spark.sql.storeAssignmentPolicy=ANSI` +INSERT INTO t VALUES ('1'); + + org.apache.spark.sql.AnalysisException: Cannot write incompatible data to table '`default`.`t`': + - Cannot safely cast 'v': StringType to IntegerType; + +-- `spark.sql.storeAssignmentPolicy=LEGACY` (This is a legacy behaviour until Spark 2.x) +INSERT INTO t VALUES ('1'); +SELECT * FROM t; + + +---+ + | v| + +---+ + | 1| + +---+ + +{% endhighlight %} + +### SQL Keywords + When `spark.sql.ansi.enabled` is true, Spark SQL will use the ANSI mode parser. In this mode, Spark SQL has two kinds of keywords: * Reserved keywords: Keywords that are reserved and can't be used as identifiers for table, view, column, function, alias, etc. 
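The behaviours documented above can also be reproduced from Scala; the following is a rough spark-shell sketch (it assumes an existing `SparkSession` named `spark`, and the failing statements are left commented out with the expected exceptions noted):

```scala
// Legacy behaviour (default): silent wrap-around and lenient casts.
spark.conf.set("spark.sql.ansi.enabled", "false")
spark.sql("SELECT 2147483647 + 1").show()    // -2147483648
spark.sql("SELECT CAST('a' AS INT)").show()  // null

// ANSI behaviour: the same statements fail at runtime.
spark.conf.set("spark.sql.ansi.enabled", "true")
// spark.sql("SELECT 2147483647 + 1").show()
//   java.lang.ArithmeticException: integer overflow
// spark.sql("SELECT CAST('a' AS INT)").show()
//   java.lang.NumberFormatException: invalid input syntax for type numeric: a

// Store assignment: with the ANSI policy, writing a string into an INT column is rejected.
spark.conf.set("spark.sql.storeAssignmentPolicy", "ANSI")
spark.sql("CREATE TABLE t (v INT) USING parquet")
// spark.sql("INSERT INTO t VALUES ('1')")
//   org.apache.spark.sql.AnalysisException: Cannot write incompatible data to table ...
```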
diff --git a/docs/sql-ref-arithmetic-ops.md b/docs/sql-ref-arithmetic-ops.md deleted file mode 100644 index 7bc8ffe31c990..0000000000000 --- a/docs/sql-ref-arithmetic-ops.md +++ /dev/null @@ -1,22 +0,0 @@ ---- -layout: global -title: Arithmetic Operations -displayTitle: Arithmetic Operations -license: | - Licensed to the Apache Software Foundation (ASF) under one or more - contributor license agreements. See the NOTICE file distributed with - this work for additional information regarding copyright ownership. - The ASF licenses this file to You under the Apache License, Version 2.0 - (the "License"); you may not use this file except in compliance with - the License. You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. ---- - -Operations performed on numeric types (with the exception of decimal) are not checked for overflow. This means that in case an operation causes an overflow, the result is the same that the same operation returns in a Java/Scala program (eg. if the sum of 2 integers is higher than the maximum value representable, the result is a negative number). From a4ceea6868002b88161517b14b94a2006be8af1b Mon Sep 17 00:00:00 2001 From: Wenchen Fan Date: Thu, 13 Feb 2020 20:09:24 +0100 Subject: [PATCH 037/185] [SPARK-30751][SQL] Combine the skewed readers into one in AQE skew join optimizations ### What changes were proposed in this pull request? This is a followup of https://github.com/apache/spark/pull/26434 This PR use one special shuffle reader for skew join, so that we only have one join after optimization. In order to do that, this PR 1. add a very general `CustomShuffledRowRDD` which support all kind of partition arrangement. 2. move the logic of coalescing shuffle partitions to a util function, and call it during skew join optimization, to totally decouple with the `ReduceNumShufflePartitions` rule. It's too complicated to interfere skew join with `ReduceNumShufflePartitions`, as you need to consider the size of split partitions which don't respect target size already. ### Why are the changes needed? The current skew join optimization has a serious performance issue: the size of the query plan depends on the number and size of skewed partitions. ### Does this PR introduce any user-facing change? no ### How was this patch tested? 
existing tests test UI manually: ![image](https://user-images.githubusercontent.com/3182036/74357390-cfb30480-4dfa-11ea-83f6-825d1b9379ca.png) explain output ``` AdaptiveSparkPlan(isFinalPlan=true) +- OverwriteByExpression org.apache.spark.sql.execution.datasources.noop.NoopTable$403a2ed5, [AlwaysTrue()], org.apache.spark.sql.util.CaseInsensitiveStringMap1f +- *(5) SortMergeJoin(skew=true) [key1#2L], [key2#6L], Inner :- *(3) Sort [key1#2L ASC NULLS FIRST], false, 0 : +- SkewJoinShuffleReader 2 skewed partitions with size(max=5 KB, min=5 KB, avg=5 KB) : +- ShuffleQueryStage 0 : +- Exchange hashpartitioning(key1#2L, 200), true, [id=#53] : +- *(1) Project [(id#0L % 2) AS key1#2L] : +- *(1) Filter isnotnull((id#0L % 2)) : +- *(1) Range (0, 100000, step=1, splits=6) +- *(4) Sort [key2#6L ASC NULLS FIRST], false, 0 +- SkewJoinShuffleReader 2 skewed partitions with size(max=5 KB, min=5 KB, avg=5 KB) +- ShuffleQueryStage 1 +- Exchange hashpartitioning(key2#6L, 200), true, [id=#64] +- *(2) Project [((id#4L % 2) + 1) AS key2#6L] +- *(2) Filter isnotnull(((id#4L % 2) + 1)) +- *(2) Range (0, 100000, step=1, splits=6) ``` Closes #27493 from cloud-fan/aqe. Authored-by: Wenchen Fan Signed-off-by: herman --- .../spark/sql/execution/ShuffledRowRDD.scala | 23 +- .../adaptive/CustomShuffledRowRDD.scala | 113 +++++++ .../adaptive/OptimizeLocalShuffleReader.scala | 2 +- .../adaptive/OptimizeSkewedJoin.scala | 276 +++++++++++------- .../adaptive/ReduceNumShufflePartitions.scala | 157 ++-------- .../adaptive/ShufflePartitionsCoalescer.scala | 112 +++++++ .../adaptive/SkewedShuffledRowRDD.scala | 78 ----- .../exchange/ShuffleExchangeExec.scala | 21 +- .../execution/joins/SortMergeJoinExec.scala | 13 +- .../ReduceNumShufflePartitionsSuite.scala | 210 +------------ .../ShufflePartitionsCoalescerSuite.scala | 220 ++++++++++++++ .../adaptive/AdaptiveQueryExecSuite.scala | 219 +++++--------- 12 files changed, 741 insertions(+), 703 deletions(-) create mode 100644 sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/CustomShuffledRowRDD.scala create mode 100644 sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/ShufflePartitionsCoalescer.scala delete mode 100644 sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/SkewedShuffledRowRDD.scala create mode 100644 sql/core/src/test/scala/org/apache/spark/sql/execution/ShufflePartitionsCoalescerSuite.scala diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/ShuffledRowRDD.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/ShuffledRowRDD.scala index efa493923ccc1..4c19f95796d04 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/ShuffledRowRDD.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/ShuffledRowRDD.scala @@ -116,7 +116,7 @@ class CoalescedPartitioner(val parent: Partitioner, val partitionStartIndices: A class ShuffledRowRDD( var dependency: ShuffleDependency[Int, InternalRow, InternalRow], metrics: Map[String, SQLMetric], - specifiedPartitionIndices: Option[Array[(Int, Int)]] = None) + specifiedPartitionStartIndices: Option[Array[Int]] = None) extends RDD[InternalRow](dependency.rdd.context, Nil) { if (SQLConf.get.fetchShuffleBlocksInBatchEnabled) { @@ -126,8 +126,8 @@ class ShuffledRowRDD( private[this] val numPreShufflePartitions = dependency.partitioner.numPartitions - private[this] val partitionStartIndices: Array[Int] = specifiedPartitionIndices match { - case Some(indices) => indices.map(_._1) + private[this] val partitionStartIndices: Array[Int] = 
specifiedPartitionStartIndices match { + case Some(indices) => indices case None => // When specifiedPartitionStartIndices is not defined, every post-shuffle partition // corresponds to a pre-shuffle partition. @@ -142,15 +142,16 @@ class ShuffledRowRDD( override val partitioner: Option[Partitioner] = Some(part) override def getPartitions: Array[Partition] = { - specifiedPartitionIndices match { - case Some(indices) => - Array.tabulate[Partition](indices.length) { i => - new ShuffledRowRDDPartition(i, indices(i)._1, indices(i)._2) - } - case None => - Array.tabulate[Partition](numPreShufflePartitions) { i => - new ShuffledRowRDDPartition(i, i, i + 1) + assert(partitionStartIndices.length == part.numPartitions) + Array.tabulate[Partition](partitionStartIndices.length) { i => + val startIndex = partitionStartIndices(i) + val endIndex = + if (i < partitionStartIndices.length - 1) { + partitionStartIndices(i + 1) + } else { + numPreShufflePartitions } + new ShuffledRowRDDPartition(i, startIndex, endIndex) } } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/CustomShuffledRowRDD.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/CustomShuffledRowRDD.scala new file mode 100644 index 0000000000000..5aba57443d632 --- /dev/null +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/CustomShuffledRowRDD.scala @@ -0,0 +1,113 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.sql.execution.adaptive + +import org.apache.spark.{Dependency, MapOutputTrackerMaster, Partition, ShuffleDependency, SparkEnv, TaskContext} +import org.apache.spark.rdd.RDD +import org.apache.spark.sql.catalyst.InternalRow +import org.apache.spark.sql.execution.metric.{SQLMetric, SQLShuffleReadMetricsReporter} + +sealed trait ShufflePartitionSpec + +// A partition that reads data of one reducer. +case class SinglePartitionSpec(reducerIndex: Int) extends ShufflePartitionSpec + +// A partition that reads data of multiple reducers, from `startReducerIndex` (inclusive) to +// `endReducerIndex` (exclusive). +case class CoalescedPartitionSpec( + startReducerIndex: Int, endReducerIndex: Int) extends ShufflePartitionSpec + +// A partition that reads partial data of one reducer, from `startMapIndex` (inclusive) to +// `endMapIndex` (exclusive). +case class PartialPartitionSpec( + reducerIndex: Int, startMapIndex: Int, endMapIndex: Int) extends ShufflePartitionSpec + +private final case class CustomShufflePartition( + index: Int, spec: ShufflePartitionSpec) extends Partition + +// TODO: merge this with `ShuffledRowRDD`, and replace `LocalShuffledRowRDD` with this RDD. 
+class CustomShuffledRowRDD( + var dependency: ShuffleDependency[Int, InternalRow, InternalRow], + metrics: Map[String, SQLMetric], + partitionSpecs: Array[ShufflePartitionSpec]) + extends RDD[InternalRow](dependency.rdd.context, Nil) { + + override def getDependencies: Seq[Dependency[_]] = List(dependency) + + override def clearDependencies() { + super.clearDependencies() + dependency = null + } + + override def getPartitions: Array[Partition] = { + Array.tabulate[Partition](partitionSpecs.length) { i => + CustomShufflePartition(i, partitionSpecs(i)) + } + } + + override def getPreferredLocations(partition: Partition): Seq[String] = { + val tracker = SparkEnv.get.mapOutputTracker.asInstanceOf[MapOutputTrackerMaster] + partition.asInstanceOf[CustomShufflePartition].spec match { + case SinglePartitionSpec(reducerIndex) => + tracker.getPreferredLocationsForShuffle(dependency, reducerIndex) + + case CoalescedPartitionSpec(startReducerIndex, endReducerIndex) => + startReducerIndex.until(endReducerIndex).flatMap { reducerIndex => + tracker.getPreferredLocationsForShuffle(dependency, reducerIndex) + } + + case PartialPartitionSpec(_, startMapIndex, endMapIndex) => + tracker.getMapLocation(dependency, startMapIndex, endMapIndex) + } + } + + override def compute(split: Partition, context: TaskContext): Iterator[InternalRow] = { + val tempMetrics = context.taskMetrics().createTempShuffleReadMetrics() + // `SQLShuffleReadMetricsReporter` will update its own metrics for SQL exchange operator, + // as well as the `tempMetrics` for basic shuffle metrics. + val sqlMetricsReporter = new SQLShuffleReadMetricsReporter(tempMetrics, metrics) + val reader = split.asInstanceOf[CustomShufflePartition].spec match { + case SinglePartitionSpec(reducerIndex) => + SparkEnv.get.shuffleManager.getReader( + dependency.shuffleHandle, + reducerIndex, + reducerIndex + 1, + context, + sqlMetricsReporter) + + case CoalescedPartitionSpec(startReducerIndex, endReducerIndex) => + SparkEnv.get.shuffleManager.getReader( + dependency.shuffleHandle, + startReducerIndex, + endReducerIndex, + context, + sqlMetricsReporter) + + case PartialPartitionSpec(reducerIndex, startMapIndex, endMapIndex) => + SparkEnv.get.shuffleManager.getReaderForRange( + dependency.shuffleHandle, + startMapIndex, + endMapIndex, + reducerIndex, + reducerIndex + 1, + context, + sqlMetricsReporter) + } + reader.read().asInstanceOf[Iterator[Product2[Int, InternalRow]]].map(_._2) + } +} diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/OptimizeLocalShuffleReader.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/OptimizeLocalShuffleReader.scala index a8d8f358ab660..e95441e28aafe 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/OptimizeLocalShuffleReader.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/OptimizeLocalShuffleReader.scala @@ -71,7 +71,7 @@ case class OptimizeLocalShuffleReader(conf: SQLConf) extends Rule[SparkPlan] { plan match { case c @ CoalescedShuffleReaderExec(s: ShuffleQueryStageExec, _) => LocalShuffleReaderExec( - s, getPartitionStartIndices(s, Some(c.partitionIndices.length))) + s, getPartitionStartIndices(s, Some(c.partitionStartIndices.length))) case s: ShuffleQueryStageExec => LocalShuffleReaderExec(s, getPartitionStartIndices(s, None)) } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/OptimizeSkewedJoin.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/OptimizeSkewedJoin.scala index 
74b7fbd317fc8..a716497c274b8 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/OptimizeSkewedJoin.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/OptimizeSkewedJoin.scala @@ -20,6 +20,8 @@ package org.apache.spark.sql.execution.adaptive import scala.collection.mutable import scala.collection.mutable.ArrayBuffer +import org.apache.commons.io.FileUtils + import org.apache.spark.{MapOutputStatistics, MapOutputTrackerMaster, SparkEnv} import org.apache.spark.rdd.RDD import org.apache.spark.sql.catalyst.InternalRow @@ -44,11 +46,7 @@ case class OptimizeSkewedJoin(conf: SQLConf) extends Rule[SparkPlan] { * partition size * spark.sql.adaptive.skewedPartitionFactor and also larger than * spark.sql.adaptive.skewedPartitionSizeThreshold. */ - private def isSkewed( - stats: MapOutputStatistics, - partitionId: Int, - medianSize: Long): Boolean = { - val size = stats.bytesByPartitionId(partitionId) + private def isSkewed(size: Long, medianSize: Long): Boolean = { size > medianSize * conf.getConf(SQLConf.ADAPTIVE_EXECUTION_SKEWED_PARTITION_FACTOR) && size > conf.getConf(SQLConf.ADAPTIVE_EXECUTION_SKEWED_PARTITION_SIZE_THRESHOLD) } @@ -108,12 +106,12 @@ case class OptimizeSkewedJoin(conf: SQLConf) extends Rule[SparkPlan] { stage.resultOption.get.asInstanceOf[MapOutputStatistics] } - private def supportSplitOnLeftPartition(joinType: JoinType) = { + private def canSplitLeftSide(joinType: JoinType) = { joinType == Inner || joinType == Cross || joinType == LeftSemi || joinType == LeftAnti || joinType == LeftOuter } - private def supportSplitOnRightPartition(joinType: JoinType) = { + private def canSplitRightSide(joinType: JoinType) = { joinType == Inner || joinType == Cross || joinType == RightOuter } @@ -130,17 +128,18 @@ case class OptimizeSkewedJoin(conf: SQLConf) extends Rule[SparkPlan] { * 1. Check whether the shuffle partition is skewed based on the median size * and the skewed partition threshold in origin smj. * 2. Assuming partition0 is skewed in left side, and it has 5 mappers (Map0, Map1...Map4). - * And we will split the 5 Mappers into 3 mapper ranges [(Map0, Map1), (Map2, Map3), (Map4)] + * And we may split the 5 Mappers into 3 mapper ranges [(Map0, Map1), (Map2, Map3), (Map4)] * based on the map size and the max split number. - * 3. Create the 3 smjs with separately reading the above mapper ranges and then join with - * the Partition0 in right side. - * 4. Finally union the above 3 split smjs and the origin smj. + * 3. Wrap the join left child with a special shuffle reader that reads each mapper range with one + * task, so total 3 tasks. + * 4. Wrap the join right child with a special shuffle reader that reads partition0 3 times by + * 3 tasks separately. 
*/ def optimizeSkewJoin(plan: SparkPlan): SparkPlan = plan.transformUp { - case smj @ SortMergeJoinExec(leftKeys, rightKeys, joinType, condition, + case smj @ SortMergeJoinExec(_, _, joinType, _, s1 @ SortExec(_, _, left: ShuffleQueryStageExec, _), s2 @ SortExec(_, _, right: ShuffleQueryStageExec, _), _) - if (supportedJoinTypes.contains(joinType)) => + if supportedJoinTypes.contains(joinType) => val leftStats = getStatistics(left) val rightStats = getStatistics(right) val numPartitions = leftStats.bytesByPartitionId.length @@ -155,61 +154,134 @@ case class OptimizeSkewedJoin(conf: SQLConf) extends Rule[SparkPlan] { |Right side partition size: |${getSizeInfo(rightMedSize, rightStats.bytesByPartitionId.max)} """.stripMargin) + val canSplitLeft = canSplitLeftSide(joinType) + val canSplitRight = canSplitRightSide(joinType) + + val leftSidePartitions = mutable.ArrayBuffer.empty[ShufflePartitionSpec] + val rightSidePartitions = mutable.ArrayBuffer.empty[ShufflePartitionSpec] + // This is used to delay the creation of non-skew partitions so that we can potentially + // coalesce them like `ReduceNumShufflePartitions` does. + val nonSkewPartitionIndices = mutable.ArrayBuffer.empty[Int] + val leftSkewDesc = new SkewDesc + val rightSkewDesc = new SkewDesc + for (partitionIndex <- 0 until numPartitions) { + val leftSize = leftStats.bytesByPartitionId(partitionIndex) + val isLeftSkew = isSkewed(leftSize, leftMedSize) && canSplitLeft + val rightSize = rightStats.bytesByPartitionId(partitionIndex) + val isRightSkew = isSkewed(rightSize, rightMedSize) && canSplitRight + if (isLeftSkew || isRightSkew) { + if (nonSkewPartitionIndices.nonEmpty) { + // As soon as we see a skew, we'll "flush" out unhandled non-skew partitions. + createNonSkewPartitions(leftStats, rightStats, nonSkewPartitionIndices).foreach { p => + leftSidePartitions += p + rightSidePartitions += p + } + nonSkewPartitionIndices.clear() + } - val skewedPartitions = mutable.HashSet[Int]() - val subJoins = mutable.ArrayBuffer[SparkPlan]() - for (partitionId <- 0 until numPartitions) { - val isLeftSkew = isSkewed(leftStats, partitionId, leftMedSize) - val isRightSkew = isSkewed(rightStats, partitionId, rightMedSize) - val leftMapIdStartIndices = if (isLeftSkew && supportSplitOnLeftPartition(joinType)) { - getMapStartIndices(left, partitionId) - } else { - Array(0) - } - val rightMapIdStartIndices = if (isRightSkew && supportSplitOnRightPartition(joinType)) { - getMapStartIndices(right, partitionId) - } else { - Array(0) - } + val leftParts = if (isLeftSkew) { + leftSkewDesc.addPartitionSize(leftSize) + createSkewPartitions( + partitionIndex, + getMapStartIndices(left, partitionIndex), + getNumMappers(left)) + } else { + Seq(SinglePartitionSpec(partitionIndex)) + } - if (leftMapIdStartIndices.length > 1 || rightMapIdStartIndices.length > 1) { - skewedPartitions += partitionId - for (i <- 0 until leftMapIdStartIndices.length; - j <- 0 until rightMapIdStartIndices.length) { - val leftEndMapId = if (i == leftMapIdStartIndices.length - 1) { - getNumMappers(left) - } else { - leftMapIdStartIndices(i + 1) - } - val rightEndMapId = if (j == rightMapIdStartIndices.length - 1) { - getNumMappers(right) - } else { - rightMapIdStartIndices(j + 1) + val rightParts = if (isRightSkew) { + rightSkewDesc.addPartitionSize(rightSize) + createSkewPartitions( + partitionIndex, + getMapStartIndices(right, partitionIndex), + getNumMappers(right)) + } else { + Seq(SinglePartitionSpec(partitionIndex)) + } + + for { + leftSidePartition <- leftParts + rightSidePartition <- 
rightParts + } { + leftSidePartitions += leftSidePartition + rightSidePartitions += rightSidePartition + } + } else { + // Add to `nonSkewPartitionIndices` first, and add real partitions later, in case we can + // coalesce the non-skew partitions. + nonSkewPartitionIndices += partitionIndex + // If this is the last partition, add real partition immediately. + if (partitionIndex == numPartitions - 1) { + createNonSkewPartitions(leftStats, rightStats, nonSkewPartitionIndices).foreach { p => + leftSidePartitions += p + rightSidePartitions += p } - // TODO: we may can optimize the sort merge join to broad cast join after - // obtaining the raw data size of per partition, - val leftSkewedReader = SkewedPartitionReaderExec( - left, partitionId, leftMapIdStartIndices(i), leftEndMapId) - val rightSkewedReader = SkewedPartitionReaderExec(right, partitionId, - rightMapIdStartIndices(j), rightEndMapId) - subJoins += SortMergeJoinExec(leftKeys, rightKeys, joinType, condition, - s1.copy(child = leftSkewedReader), s2.copy(child = rightSkewedReader), true) + nonSkewPartitionIndices.clear() } } } - logDebug(s"number of skewed partitions is ${skewedPartitions.size}") - if (skewedPartitions.nonEmpty) { - val optimizedSmj = smj.copy( - left = s1.copy(child = PartialShuffleReaderExec(left, skewedPartitions.toSet)), - right = s2.copy(child = PartialShuffleReaderExec(right, skewedPartitions.toSet)), - isPartial = true) - subJoins += optimizedSmj - UnionExec(subJoins) + + logDebug("number of skewed partitions: " + + s"left ${leftSkewDesc.numPartitions}, right ${rightSkewDesc.numPartitions}") + if (leftSkewDesc.numPartitions > 0 || rightSkewDesc.numPartitions > 0) { + val newLeft = SkewJoinShuffleReaderExec( + left, leftSidePartitions.toArray, leftSkewDesc.toString) + val newRight = SkewJoinShuffleReaderExec( + right, rightSidePartitions.toArray, rightSkewDesc.toString) + smj.copy( + left = s1.copy(child = newLeft), right = s2.copy(child = newRight), isSkewJoin = true) } else { smj } } + private def createNonSkewPartitions( + leftStats: MapOutputStatistics, + rightStats: MapOutputStatistics, + nonSkewPartitionIndices: Seq[Int]): Seq[ShufflePartitionSpec] = { + assert(nonSkewPartitionIndices.nonEmpty) + if (nonSkewPartitionIndices.length == 1) { + Seq(SinglePartitionSpec(nonSkewPartitionIndices.head)) + } else { + val startIndices = ShufflePartitionsCoalescer.coalescePartitions( + Array(leftStats, rightStats), + firstPartitionIndex = nonSkewPartitionIndices.head, + // `lastPartitionIndex` is exclusive. + lastPartitionIndex = nonSkewPartitionIndices.last + 1, + advisoryTargetSize = conf.targetPostShuffleInputSize) + startIndices.indices.map { i => + val startIndex = startIndices(i) + val endIndex = if (i == startIndices.length - 1) { + // `endIndex` is exclusive. + nonSkewPartitionIndices.last + 1 + } else { + startIndices(i + 1) + } + // Do not create `CoalescedPartitionSpec` if only need to read a singe partition. 
+ if (startIndex + 1 == endIndex) { + SinglePartitionSpec(startIndex) + } else { + CoalescedPartitionSpec(startIndex, endIndex) + } + } + } + } + + private def createSkewPartitions( + reducerIndex: Int, + mapStartIndices: Array[Int], + numMappers: Int): Seq[PartialPartitionSpec] = { + mapStartIndices.indices.map { i => + val startMapIndex = mapStartIndices(i) + val endMapIndex = if (i == mapStartIndices.length - 1) { + numMappers + } else { + mapStartIndices(i + 1) + } + PartialPartitionSpec(reducerIndex, startMapIndex, endMapIndex) + } + } + override def apply(plan: SparkPlan): SparkPlan = { if (!conf.getConf(SQLConf.ADAPTIVE_EXECUTION_SKEWED_JOIN_ENABLED)) { return plan @@ -248,79 +320,69 @@ case class OptimizeSkewedJoin(conf: SQLConf) extends Rule[SparkPlan] { } } -/** - * A wrapper of shuffle query stage, which submits one reduce task to read a single - * shuffle partition 'partitionIndex' produced by the mappers in range [startMapIndex, endMapIndex). - * This is used to increase the parallelism when reading skewed partitions. - * - * @param child It's usually `ShuffleQueryStageExec`, but can be the shuffle exchange - * node during canonicalization. - * @param partitionIndex The pre shuffle partition index. - * @param startMapIndex The start map index. - * @param endMapIndex The end map index. - */ -case class SkewedPartitionReaderExec( - child: QueryStageExec, - partitionIndex: Int, - startMapIndex: Int, - endMapIndex: Int) extends LeafExecNode { +private class SkewDesc { + private[this] var numSkewedPartitions: Int = 0 + private[this] var totalSize: Long = 0 + private[this] var maxSize: Long = 0 + private[this] var minSize: Long = 0 - override def output: Seq[Attribute] = child.output + def numPartitions: Int = numSkewedPartitions - override def outputPartitioning: Partitioning = { - UnknownPartitioning(1) + def addPartitionSize(size: Long): Unit = { + if (numSkewedPartitions == 0) { + maxSize = size + minSize = size + } + numSkewedPartitions += 1 + totalSize += size + if (size > maxSize) maxSize = size + if (size < minSize) minSize = size } - private var cachedSkewedShuffleRDD: SkewedShuffledRowRDD = null - override def doExecute(): RDD[InternalRow] = { - if (cachedSkewedShuffleRDD == null) { - cachedSkewedShuffleRDD = child match { - case stage: ShuffleQueryStageExec => - stage.shuffle.createSkewedShuffleRDD(partitionIndex, startMapIndex, endMapIndex) - case _ => - throw new IllegalStateException("operating on canonicalization plan") - } + override def toString: String = { + if (numSkewedPartitions == 0) { + "no skewed partition" + } else { + val maxSizeStr = FileUtils.byteCountToDisplaySize(maxSize) + val minSizeStr = FileUtils.byteCountToDisplaySize(minSize) + val avgSizeStr = FileUtils.byteCountToDisplaySize(totalSize / numSkewedPartitions) + s"$numSkewedPartitions skewed partitions with " + + s"size(max=$maxSizeStr, min=$minSizeStr, avg=$avgSizeStr)" } - cachedSkewedShuffleRDD } } /** - * A wrapper of shuffle query stage, which skips some partitions when reading the shuffle blocks. + * A wrapper of shuffle query stage, which follows the given partition arrangement. * * @param child It's usually `ShuffleQueryStageExec`, but can be the shuffle exchange node during * canonicalization. - * @param excludedPartitions The partitions to skip when reading. + * @param partitionSpecs The partition specs that defines the arrangement. + * @param skewDesc The description of the skewed partitions. 
*/ -case class PartialShuffleReaderExec( - child: QueryStageExec, - excludedPartitions: Set[Int]) extends UnaryExecNode { +case class SkewJoinShuffleReaderExec( + child: SparkPlan, + partitionSpecs: Array[ShufflePartitionSpec], + skewDesc: String) extends UnaryExecNode { override def output: Seq[Attribute] = child.output override def outputPartitioning: Partitioning = { - UnknownPartitioning(1) + UnknownPartitioning(partitionSpecs.length) } - private def shuffleExchange(): ShuffleExchangeExec = child match { - case stage: ShuffleQueryStageExec => stage.shuffle - case _ => - throw new IllegalStateException("operating on canonicalization plan") - } - - private def getPartitionIndexRanges(): Array[(Int, Int)] = { - val length = shuffleExchange().shuffleDependency.partitioner.numPartitions - (0 until length).filterNot(excludedPartitions.contains).map(i => (i, i + 1)).toArray - } + override def stringArgs: Iterator[Any] = Iterator(skewDesc) private var cachedShuffleRDD: RDD[InternalRow] = null - override def doExecute(): RDD[InternalRow] = { + override protected def doExecute(): RDD[InternalRow] = { if (cachedShuffleRDD == null) { - cachedShuffleRDD = if (excludedPartitions.isEmpty) { - child.execute() - } else { - shuffleExchange().createShuffledRDD(Some(getPartitionIndexRanges())) + cachedShuffleRDD = child match { + case stage: ShuffleQueryStageExec => + new CustomShuffledRowRDD( + stage.shuffle.shuffleDependency, stage.shuffle.readMetrics, partitionSpecs) + case _ => + throw new IllegalStateException("operating on canonicalization plan") } } cachedShuffleRDD diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/ReduceNumShufflePartitions.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/ReduceNumShufflePartitions.scala index 2c50b638b4d45..5bbcb14e008d8 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/ReduceNumShufflePartitions.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/ReduceNumShufflePartitions.scala @@ -17,8 +17,6 @@ package org.apache.spark.sql.execution.adaptive -import scala.collection.mutable.{ArrayBuffer, HashSet} - import org.apache.spark.MapOutputStatistics import org.apache.spark.rdd.RDD import org.apache.spark.sql.catalyst.InternalRow @@ -29,24 +27,8 @@ import org.apache.spark.sql.execution.{ShuffledRowRDD, SparkPlan, UnaryExecNode} import org.apache.spark.sql.internal.SQLConf /** - * A rule to adjust the post shuffle partitions based on the map output statistics. - * - * The strategy used to determine the number of post-shuffle partitions is described as follows. - * To determine the number of post-shuffle partitions, we have a target input size for a - * post-shuffle partition. Once we have size statistics of all pre-shuffle partitions, we will do - * a pass of those statistics and pack pre-shuffle partitions with continuous indices to a single - * post-shuffle partition until adding another pre-shuffle partition would cause the size of a - * post-shuffle partition to be greater than the target size. 
- * - * For example, we have two stages with the following pre-shuffle partition size statistics: - * stage 1: [100 MiB, 20 MiB, 100 MiB, 10MiB, 30 MiB] - * stage 2: [10 MiB, 10 MiB, 70 MiB, 5 MiB, 5 MiB] - * assuming the target input size is 128 MiB, we will have four post-shuffle partitions, - * which are: - * - post-shuffle partition 0: pre-shuffle partition 0 (size 110 MiB) - * - post-shuffle partition 1: pre-shuffle partition 1 (size 30 MiB) - * - post-shuffle partition 2: pre-shuffle partition 2 (size 170 MiB) - * - post-shuffle partition 3: pre-shuffle partition 3 and 4 (size 50 MiB) + * A rule to reduce the post shuffle partitions based on the map output statistics, which can + * avoid many small reduce tasks that hurt performance. */ case class ReduceNumShufflePartitions(conf: SQLConf) extends Rule[SparkPlan] { @@ -54,28 +36,21 @@ case class ReduceNumShufflePartitions(conf: SQLConf) extends Rule[SparkPlan] { if (!conf.reducePostShufflePartitionsEnabled) { return plan } - // 'SkewedShufflePartitionReader' is added by us, so it's safe to ignore it when changing - // number of reducers. - val leafNodes = plan.collectLeaves().filter(!_.isInstanceOf[SkewedPartitionReaderExec]) - if (!leafNodes.forall(_.isInstanceOf[QueryStageExec])) { + if (!plan.collectLeaves().forall(_.isInstanceOf[QueryStageExec])) { // If not all leaf nodes are query stages, it's not safe to reduce the number of // shuffle partitions, because we may break the assumption that all children of a spark plan // have same number of output partitions. return plan } - def collectShuffles(plan: SparkPlan): Seq[SparkPlan] = plan match { + def collectShuffleStages(plan: SparkPlan): Seq[ShuffleQueryStageExec] = plan match { case _: LocalShuffleReaderExec => Nil - case p: PartialShuffleReaderExec => Seq(p) + case _: SkewJoinShuffleReaderExec => Nil case stage: ShuffleQueryStageExec => Seq(stage) - case _ => plan.children.flatMap(collectShuffles) + case _ => plan.children.flatMap(collectShuffleStages) } - val shuffles = collectShuffles(plan) - val shuffleStages = shuffles.map { - case PartialShuffleReaderExec(s: ShuffleQueryStageExec, _) => s - case s: ShuffleQueryStageExec => s - } + val shuffleStages = collectShuffleStages(plan) // ShuffleExchanges introduced by repartition do not support changing the number of partitions. // We change the number of partitions in the stage only if all the ShuffleExchanges support it. if (!shuffleStages.forall(_.shuffle.canChangeNumPartitions)) { @@ -94,110 +69,27 @@ case class ReduceNumShufflePartitions(conf: SQLConf) extends Rule[SparkPlan] { // partition) and a result of a SortMergeJoin (multiple partitions). 
val distinctNumPreShufflePartitions = validMetrics.map(stats => stats.bytesByPartitionId.length).distinct - val distinctExcludedPartitions = shuffles.map { - case PartialShuffleReaderExec(_, excludedPartitions) => excludedPartitions - case _: ShuffleQueryStageExec => Set.empty[Int] - }.distinct - if (validMetrics.nonEmpty && distinctNumPreShufflePartitions.length == 1 - && distinctExcludedPartitions.length == 1) { - val excludedPartitions = distinctExcludedPartitions.head - val partitionIndices = estimatePartitionStartAndEndIndices( - validMetrics.toArray, excludedPartitions) + if (validMetrics.nonEmpty && distinctNumPreShufflePartitions.length == 1) { + val partitionStartIndices = ShufflePartitionsCoalescer.coalescePartitions( + validMetrics.toArray, + firstPartitionIndex = 0, + lastPartitionIndex = distinctNumPreShufflePartitions.head, + advisoryTargetSize = conf.targetPostShuffleInputSize, + minNumPartitions = conf.minNumPostShufflePartitions) // This transformation adds new nodes, so we must use `transformUp` here. - // Even for shuffle exchange whose input RDD has 0 partition, we should still update its - // `partitionStartIndices`, so that all the leaf shuffles in a stage have the same - // number of output partitions. - val visitedStages = HashSet.empty[Int] - plan.transformDown { - // Replace `PartialShuffleReaderExec` with `CoalescedShuffleReaderExec`, which keeps the - // "excludedPartition" requirement and also merges some partitions. - case PartialShuffleReaderExec(stage: ShuffleQueryStageExec, _) => - visitedStages.add(stage.id) - CoalescedShuffleReaderExec(stage, partitionIndices) - - // We are doing `transformDown`, so the `ShuffleQueryStageExec` may already be optimized - // and wrapped by `CoalescedShuffleReaderExec`. - case stage: ShuffleQueryStageExec if !visitedStages.contains(stage.id) => - visitedStages.add(stage.id) - CoalescedShuffleReaderExec(stage, partitionIndices) + val stageIds = shuffleStages.map(_.id).toSet + plan.transformUp { + // even for shuffle exchange whose input RDD has 0 partition, we should still update its + // `partitionStartIndices`, so that all the leaf shuffles in a stage have the same + // number of output partitions. + case stage: ShuffleQueryStageExec if stageIds.contains(stage.id) => + CoalescedShuffleReaderExec(stage, partitionStartIndices) } } else { plan } } } - - /** - * Estimates partition start and end indices for post-shuffle partitions based on - * mapOutputStatistics provided by all pre-shuffle stages and skip the omittedPartitions - * already handled in skewed partition optimization. - */ - // visible for testing. - private[sql] def estimatePartitionStartAndEndIndices( - mapOutputStatistics: Array[MapOutputStatistics], - excludedPartitions: Set[Int] = Set.empty): Array[(Int, Int)] = { - val minNumPostShufflePartitions = conf.minNumPostShufflePartitions - excludedPartitions.size - val advisoryTargetPostShuffleInputSize = conf.targetPostShuffleInputSize - // If minNumPostShufflePartitions is defined, it is possible that we need to use a - // value less than advisoryTargetPostShuffleInputSize as the target input size of - // a post shuffle task. - val totalPostShuffleInputSize = mapOutputStatistics.map(_.bytesByPartitionId.sum).sum - // The max at here is to make sure that when we have an empty table, we - // only have a single post-shuffle partition. - // There is no particular reason that we pick 16. We just need a number to - // prevent maxPostShuffleInputSize from being set to 0. 
- val maxPostShuffleInputSize = math.max( - math.ceil(totalPostShuffleInputSize / minNumPostShufflePartitions.toDouble).toLong, 16) - val targetPostShuffleInputSize = - math.min(maxPostShuffleInputSize, advisoryTargetPostShuffleInputSize) - - logInfo( - s"advisoryTargetPostShuffleInputSize: $advisoryTargetPostShuffleInputSize, " + - s"targetPostShuffleInputSize $targetPostShuffleInputSize.") - - // Make sure we do get the same number of pre-shuffle partitions for those stages. - val distinctNumPreShufflePartitions = - mapOutputStatistics.map(stats => stats.bytesByPartitionId.length).distinct - // The reason that we are expecting a single value of the number of pre-shuffle partitions - // is that when we add Exchanges, we set the number of pre-shuffle partitions - // (i.e. map output partitions) using a static setting, which is the value of - // spark.sql.shuffle.partitions. Even if two input RDDs are having different - // number of partitions, they will have the same number of pre-shuffle partitions - // (i.e. map output partitions). - assert( - distinctNumPreShufflePartitions.length == 1, - "There should be only one distinct value of the number pre-shuffle partitions " + - "among registered Exchange operator.") - - val partitionStartIndices = ArrayBuffer[Int]() - val partitionEndIndices = ArrayBuffer[Int]() - val numPartitions = distinctNumPreShufflePartitions.head - val includedPartitions = (0 until numPartitions).filter(!excludedPartitions.contains(_)) - val firstStartIndex = includedPartitions(0) - partitionStartIndices += firstStartIndex - var postShuffleInputSize = mapOutputStatistics.map(_.bytesByPartitionId(firstStartIndex)).sum - var i = firstStartIndex - includedPartitions.drop(1).foreach { nextPartitionIndex => - val nextShuffleInputSize = - mapOutputStatistics.map(_.bytesByPartitionId(nextPartitionIndex)).sum - // If nextPartitionIndices is skewed and omitted, or including - // the nextShuffleInputSize would exceed the target partition size, - // then start a new partition. - if (nextPartitionIndex != i + 1 || - (postShuffleInputSize + nextShuffleInputSize > targetPostShuffleInputSize)) { - partitionEndIndices += i + 1 - partitionStartIndices += nextPartitionIndex - // reset postShuffleInputSize. - postShuffleInputSize = nextShuffleInputSize - i = nextPartitionIndex - } else { - postShuffleInputSize += nextShuffleInputSize - i += 1 - } - } - partitionEndIndices += i + 1 - partitionStartIndices.zip(partitionEndIndices).toArray - } } /** @@ -206,15 +98,16 @@ case class ReduceNumShufflePartitions(conf: SQLConf) extends Rule[SparkPlan] { * * @param child It's usually `ShuffleQueryStageExec`, but can be the shuffle exchange node during * canonicalization. + * @param partitionStartIndices The start partition indices for the coalesced partitions. 
*/ case class CoalescedShuffleReaderExec( child: SparkPlan, - partitionIndices: Array[(Int, Int)]) extends UnaryExecNode { + partitionStartIndices: Array[Int]) extends UnaryExecNode { override def output: Seq[Attribute] = child.output override def outputPartitioning: Partitioning = { - UnknownPartitioning(partitionIndices.length) + UnknownPartitioning(partitionStartIndices.length) } private var cachedShuffleRDD: ShuffledRowRDD = null @@ -223,7 +116,7 @@ case class CoalescedShuffleReaderExec( if (cachedShuffleRDD == null) { cachedShuffleRDD = child match { case stage: ShuffleQueryStageExec => - stage.shuffle.createShuffledRDD(Some(partitionIndices)) + stage.shuffle.createShuffledRDD(Some(partitionStartIndices)) case _ => throw new IllegalStateException("operating on canonicalization plan") } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/ShufflePartitionsCoalescer.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/ShufflePartitionsCoalescer.scala new file mode 100644 index 0000000000000..18f0585524aa2 --- /dev/null +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/ShufflePartitionsCoalescer.scala @@ -0,0 +1,112 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.sql.execution.adaptive + +import scala.collection.mutable.ArrayBuffer + +import org.apache.spark.MapOutputStatistics +import org.apache.spark.internal.Logging + +object ShufflePartitionsCoalescer extends Logging { + + /** + * Coalesce the same range of partitions (`firstPartitionIndex`` to `lastPartitionIndex`, the + * start is inclusive and the end is exclusive) from multiple shuffles. This method assumes that + * all the shuffles have the same number of partitions, and the partitions of same index will be + * read together by one task. + * + * The strategy used to determine the number of coalesced partitions is described as follows. + * To determine the number of coalesced partitions, we have a target size for a coalesced + * partition. Once we have size statistics of all shuffle partitions, we will do + * a pass of those statistics and pack shuffle partitions with continuous indices to a single + * coalesced partition until adding another shuffle partition would cause the size of a + * coalesced partition to be greater than the target size. 
+ * + * For example, we have two shuffles with the following partition size statistics: + * - shuffle 1 (5 partitions): [100 MiB, 20 MiB, 100 MiB, 10MiB, 30 MiB] + * - shuffle 2 (5 partitions): [10 MiB, 10 MiB, 70 MiB, 5 MiB, 5 MiB] + * Assuming the target size is 128 MiB, we will have 4 coalesced partitions, which are: + * - coalesced partition 0: shuffle partition 0 (size 110 MiB) + * - coalesced partition 1: shuffle partition 1 (size 30 MiB) + * - coalesced partition 2: shuffle partition 2 (size 170 MiB) + * - coalesced partition 3: shuffle partition 3 and 4 (size 50 MiB) + * + * @return An array of partition indices which represents the coalesced partitions. For example, + * [0, 2, 3] means 3 coalesced partitions: [0, 2), [2, 3), [3, lastPartitionIndex] + */ + def coalescePartitions( + mapOutputStatistics: Array[MapOutputStatistics], + firstPartitionIndex: Int, + lastPartitionIndex: Int, + advisoryTargetSize: Long, + minNumPartitions: Int = 1): Array[Int] = { + // If `minNumPartitions` is very large, it is possible that we need to use a value less than + // `advisoryTargetSize` as the target size of a coalesced task. + val totalPostShuffleInputSize = mapOutputStatistics.map(_.bytesByPartitionId.sum).sum + // The max at here is to make sure that when we have an empty table, we only have a single + // coalesced partition. + // There is no particular reason that we pick 16. We just need a number to prevent + // `maxTargetSize` from being set to 0. + val maxTargetSize = math.max( + math.ceil(totalPostShuffleInputSize / minNumPartitions.toDouble).toLong, 16) + val targetSize = math.min(maxTargetSize, advisoryTargetSize) + + logInfo(s"advisory target size: $advisoryTargetSize, actual target size $targetSize.") + + // Make sure these shuffles have the same number of partitions. + val distinctNumShufflePartitions = + mapOutputStatistics.map(stats => stats.bytesByPartitionId.length).distinct + // The reason that we are expecting a single value of the number of shuffle partitions + // is that when we add Exchanges, we set the number of shuffle partitions + // (i.e. map output partitions) using a static setting, which is the value of + // `spark.sql.shuffle.partitions`. Even if two input RDDs are having different + // number of partitions, they will have the same number of shuffle partitions + // (i.e. map output partitions). + assert( + distinctNumShufflePartitions.length == 1, + "There should be only one distinct value of the number of shuffle partitions " + + "among registered Exchange operators.") + + val splitPoints = ArrayBuffer[Int]() + splitPoints += firstPartitionIndex + var coalescedSize = 0L + var i = firstPartitionIndex + while (i < lastPartitionIndex) { + // We calculate the total size of i-th shuffle partitions from all shuffles. + var totalSizeOfCurrentPartition = 0L + var j = 0 + while (j < mapOutputStatistics.length) { + totalSizeOfCurrentPartition += mapOutputStatistics(j).bytesByPartitionId(i) + j += 1 + } + + // If including the `totalSizeOfCurrentPartition` would exceed the target size, then start a + // new coalesced partition. + if (i > firstPartitionIndex && coalescedSize + totalSizeOfCurrentPartition > targetSize) { + splitPoints += i + // reset postShuffleInputSize. 
+ coalescedSize = totalSizeOfCurrentPartition + } else { + coalescedSize += totalSizeOfCurrentPartition + } + i += 1 + } + + splitPoints.toArray + } +} diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/SkewedShuffledRowRDD.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/SkewedShuffledRowRDD.scala deleted file mode 100644 index 52f793b24aa17..0000000000000 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/SkewedShuffledRowRDD.scala +++ /dev/null @@ -1,78 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.spark.sql.execution.adaptive - -import org.apache.spark._ -import org.apache.spark.rdd.RDD -import org.apache.spark.sql.catalyst.InternalRow -import org.apache.spark.sql.execution.metric.{SQLMetric, SQLShuffleReadMetricsReporter} - -/** - * The [[Partition]] used by [[SkewedShuffledRowRDD]]. - */ -class SkewedShuffledRowRDDPartition(override val index: Int) extends Partition - -/** - * This is a specialized version of [[org.apache.spark.sql.execution.ShuffledRowRDD]]. This is used - * in Spark SQL adaptive execution to solve data skew issues. This RDD includes rearranged - * partitions from mappers. - * - * This RDD takes a [[ShuffleDependency]] (`dependency`), a partitionIndex - * and the range of startMapIndex to endMapIndex. - */ -class SkewedShuffledRowRDD( - var dependency: ShuffleDependency[Int, InternalRow, InternalRow], - partitionIndex: Int, - startMapIndex: Int, - endMapIndex: Int, - metrics: Map[String, SQLMetric]) - extends RDD[InternalRow](dependency.rdd.context, Nil) { - - override def getDependencies: Seq[Dependency[_]] = List(dependency) - - override def getPartitions: Array[Partition] = { - Array(new SkewedShuffledRowRDDPartition(0)) - } - - override def getPreferredLocations(partition: Partition): Seq[String] = { - val tracker = SparkEnv.get.mapOutputTracker.asInstanceOf[MapOutputTrackerMaster] - tracker.getMapLocation(dependency, startMapIndex, endMapIndex) - } - - override def compute(split: Partition, context: TaskContext): Iterator[InternalRow] = { - val tempMetrics = context.taskMetrics().createTempShuffleReadMetrics() - // `SQLShuffleReadMetricsReporter` will update its own metrics for SQL exchange operator, - // as well as the `tempMetrics` for basic shuffle metrics. 
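To make the coalescing pass in the new `ShufflePartitionsCoalescer` easier to follow, here is a self-contained sketch that reproduces the example from its comment (two shuffles, 128 MiB target) using plain arrays instead of `MapOutputStatistics`. The object name and `main` wrapper are illustrative only, and the minimum-partition and empty-input handling of the real method are omitted; this is a sketch of the packing logic, not the actual Spark code.

```
import scala.collection.mutable.ArrayBuffer

// Simplified stand-in for ShufflePartitionsCoalescer.coalescePartitions: it keeps only the
// "pack consecutive partitions until the target size would be exceeded" part of the logic.
object CoalesceSketch {
  def splitPoints(bytesByPartition: Seq[Array[Long]], targetSize: Long): Array[Int] = {
    val numPartitions = bytesByPartition.head.length
    val splits = ArrayBuffer(0)
    var coalescedSize = 0L
    var i = 0
    while (i < numPartitions) {
      // Total size of the i-th partition across all shuffles.
      val partitionSize = bytesByPartition.map(_(i)).sum
      if (i > 0 && coalescedSize + partitionSize > targetSize) {
        splits += i                      // start a new coalesced partition at index i
        coalescedSize = partitionSize
      } else {
        coalescedSize += partitionSize   // keep packing into the current coalesced partition
      }
      i += 1
    }
    splits.toArray
  }

  def main(args: Array[String]): Unit = {
    val mib = 1024L * 1024L
    val shuffle1 = Array(100L, 20L, 100L, 10L, 30L).map(_ * mib)
    val shuffle2 = Array(10L, 10L, 70L, 5L, 5L).map(_ * mib)
    // Prints [0, 1, 2, 3]: four coalesced partitions, the last one covering partitions 3 and 4,
    // matching the worked example in the comment of coalescePartitions.
    println(splitPoints(Seq(shuffle1, shuffle2), 128 * mib).mkString("[", ", ", "]"))
  }
}
```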
- val sqlMetricsReporter = new SQLShuffleReadMetricsReporter(tempMetrics, metrics) - - val reader = SparkEnv.get.shuffleManager.getReaderForRange( - dependency.shuffleHandle, - startMapIndex, - endMapIndex, - partitionIndex, - partitionIndex + 1, - context, - sqlMetricsReporter) - reader.read().asInstanceOf[Iterator[Product2[Int, InternalRow]]].map(_._2) - } - - override def clearDependencies() { - super.clearDependencies() - dependency = null - } -} diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/exchange/ShuffleExchangeExec.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/exchange/ShuffleExchangeExec.scala index ffcd6c7783354..4b08da043b83e 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/exchange/ShuffleExchangeExec.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/exchange/ShuffleExchangeExec.scala @@ -30,11 +30,11 @@ import org.apache.spark.shuffle.{ShuffleWriteMetricsReporter, ShuffleWriteProces import org.apache.spark.shuffle.sort.SortShuffleManager import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.errors._ -import org.apache.spark.sql.catalyst.expressions.{Attribute, BoundReference, UnsafeProjection, UnsafeRow} +import org.apache.spark.sql.catalyst.expressions.{Attribute, BoundReference, Divide, Literal, UnsafeProjection, UnsafeRow} import org.apache.spark.sql.catalyst.expressions.codegen.LazilyGeneratedOrdering import org.apache.spark.sql.catalyst.plans.physical._ import org.apache.spark.sql.execution._ -import org.apache.spark.sql.execution.adaptive.{LocalShuffledRowRDD, SkewedShuffledRowRDD} +import org.apache.spark.sql.execution.adaptive.LocalShuffledRowRDD import org.apache.spark.sql.execution.metric.{SQLMetric, SQLMetrics, SQLShuffleReadMetricsReporter, SQLShuffleWriteMetricsReporter} import org.apache.spark.sql.internal.SQLConf import org.apache.spark.sql.types.StructType @@ -49,11 +49,9 @@ case class ShuffleExchangeExec( child: SparkPlan, canChangeNumPartitions: Boolean = true) extends Exchange { - // NOTE: coordinator can be null after serialization/deserialization, - // e.g. it can be null on the Executor side private lazy val writeMetrics = SQLShuffleWriteMetricsReporter.createShuffleWriteMetrics(sparkContext) - private lazy val readMetrics = + private[sql] lazy val readMetrics = SQLShuffleReadMetricsReporter.createShuffleReadMetrics(sparkContext) override lazy val metrics = Map( "dataSize" -> SQLMetrics.createSizeMetric(sparkContext, "data size") @@ -90,9 +88,8 @@ case class ShuffleExchangeExec( writeMetrics) } - def createShuffledRDD( - partitionRanges: Option[Array[(Int, Int)]]): ShuffledRowRDD = { - new ShuffledRowRDD(shuffleDependency, readMetrics, partitionRanges) + def createShuffledRDD(partitionStartIndices: Option[Array[Int]]): ShuffledRowRDD = { + new ShuffledRowRDD(shuffleDependency, readMetrics, partitionStartIndices) } def createLocalShuffleRDD( @@ -100,14 +97,6 @@ case class ShuffleExchangeExec( new LocalShuffledRowRDD(shuffleDependency, readMetrics, partitionStartIndicesPerMapper) } - def createSkewedShuffleRDD( - partitionIndex: Int, - startMapIndex: Int, - endMapIndex: Int): SkewedShuffledRowRDD = { - new SkewedShuffledRowRDD(shuffleDependency, - partitionIndex, startMapIndex, endMapIndex, readMetrics) - } - /** * Caches the created ShuffleRowRDD so we can reuse that. 
*/ diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/joins/SortMergeJoinExec.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/joins/SortMergeJoinExec.scala index 6384aed6a78e0..62eea611556ff 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/joins/SortMergeJoinExec.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/joins/SortMergeJoinExec.scala @@ -28,7 +28,6 @@ import org.apache.spark.sql.catalyst.expressions.codegen.Block._ import org.apache.spark.sql.catalyst.plans._ import org.apache.spark.sql.catalyst.plans.physical._ import org.apache.spark.sql.execution._ -import org.apache.spark.sql.execution.adaptive.{PartialShuffleReaderExec, SkewedPartitionReaderExec} import org.apache.spark.sql.execution.metric.{SQLMetric, SQLMetrics} import org.apache.spark.util.collection.BitSet @@ -42,11 +41,17 @@ case class SortMergeJoinExec( condition: Option[Expression], left: SparkPlan, right: SparkPlan, - isPartial: Boolean = false) extends BinaryExecNode with CodegenSupport { + isSkewJoin: Boolean = false) extends BinaryExecNode with CodegenSupport { override lazy val metrics = Map( "numOutputRows" -> SQLMetrics.createMetric(sparkContext, "number of output rows")) + override def nodeName: String = { + if (isSkewJoin) super.nodeName + "(skew=true)" else super.nodeName + } + + override def stringArgs: Iterator[Any] = super.stringArgs.toSeq.dropRight(1).iterator + override def simpleStringWithNodeId(): String = { val opId = ExplainUtils.getOpId(this) s"$nodeName $joinType ($opId)".trim @@ -98,7 +103,9 @@ case class SortMergeJoinExec( } override def requiredChildDistribution: Seq[Distribution] = { - if (isPartial) { + if (isSkewJoin) { + // We re-arrange the shuffle partitions to deal with skew join, and the new children + // partitioning doesn't satisfy `HashClusteredDistribution`. 
UnspecifiedDistribution :: UnspecifiedDistribution :: Nil } else { HashClusteredDistribution(leftKeys) :: HashClusteredDistribution(rightKeys) :: Nil diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/ReduceNumShufflePartitionsSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/ReduceNumShufflePartitionsSuite.scala index 04b4d4f29f850..5565a0dd01840 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/ReduceNumShufflePartitionsSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/ReduceNumShufflePartitionsSuite.scala @@ -19,11 +19,11 @@ package org.apache.spark.sql.execution import org.scalatest.BeforeAndAfterAll -import org.apache.spark.{MapOutputStatistics, SparkConf, SparkFunSuite} +import org.apache.spark.{SparkConf, SparkFunSuite} import org.apache.spark.internal.config.UI.UI_ENABLED import org.apache.spark.sql._ import org.apache.spark.sql.execution.adaptive._ -import org.apache.spark.sql.execution.adaptive.{CoalescedShuffleReaderExec, ReduceNumShufflePartitions} +import org.apache.spark.sql.execution.adaptive.CoalescedShuffleReaderExec import org.apache.spark.sql.execution.exchange.ReusedExchangeExec import org.apache.spark.sql.functions._ import org.apache.spark.sql.internal.SQLConf @@ -52,212 +52,6 @@ class ReduceNumShufflePartitionsSuite extends SparkFunSuite with BeforeAndAfterA } } - private def checkEstimation( - rule: ReduceNumShufflePartitions, - bytesByPartitionIdArray: Array[Array[Long]], - expectedPartitionStartIndices: Array[Int]): Unit = { - val mapOutputStatistics = bytesByPartitionIdArray.zipWithIndex.map { - case (bytesByPartitionId, index) => - new MapOutputStatistics(index, bytesByPartitionId) - } - val estimatedPartitionStartIndices = - rule.estimatePartitionStartAndEndIndices(mapOutputStatistics).map(_._1) - assert(estimatedPartitionStartIndices === expectedPartitionStartIndices) - } - - private def createReduceNumShufflePartitionsRule( - advisoryTargetPostShuffleInputSize: Long, - minNumPostShufflePartitions: Int = 1): ReduceNumShufflePartitions = { - val conf = new SQLConf().copy( - SQLConf.SHUFFLE_TARGET_POSTSHUFFLE_INPUT_SIZE -> advisoryTargetPostShuffleInputSize, - SQLConf.SHUFFLE_MIN_NUM_POSTSHUFFLE_PARTITIONS -> minNumPostShufflePartitions) - ReduceNumShufflePartitions(conf) - } - - test("test estimatePartitionStartIndices - 1 Exchange") { - val rule = createReduceNumShufflePartitionsRule(100L) - - { - // All bytes per partition are 0. - val bytesByPartitionId = Array[Long](0, 0, 0, 0, 0) - val expectedPartitionStartIndices = Array[Int](0) - checkEstimation(rule, Array(bytesByPartitionId), expectedPartitionStartIndices) - } - - { - // Some bytes per partition are 0 and total size is less than the target size. - // 1 post-shuffle partition is needed. - val bytesByPartitionId = Array[Long](10, 0, 20, 0, 0) - val expectedPartitionStartIndices = Array[Int](0) - checkEstimation(rule, Array(bytesByPartitionId), expectedPartitionStartIndices) - } - - { - // 2 post-shuffle partitions are needed. - val bytesByPartitionId = Array[Long](10, 0, 90, 20, 0) - val expectedPartitionStartIndices = Array[Int](0, 3) - checkEstimation(rule, Array(bytesByPartitionId), expectedPartitionStartIndices) - } - - { - // There are a few large pre-shuffle partitions. 
- val bytesByPartitionId = Array[Long](110, 10, 100, 110, 0) - val expectedPartitionStartIndices = Array[Int](0, 1, 2, 3, 4) - checkEstimation(rule, Array(bytesByPartitionId), expectedPartitionStartIndices) - } - - { - // All pre-shuffle partitions are larger than the targeted size. - val bytesByPartitionId = Array[Long](100, 110, 100, 110, 110) - val expectedPartitionStartIndices = Array[Int](0, 1, 2, 3, 4) - checkEstimation(rule, Array(bytesByPartitionId), expectedPartitionStartIndices) - } - - { - // The last pre-shuffle partition is in a single post-shuffle partition. - val bytesByPartitionId = Array[Long](30, 30, 0, 40, 110) - val expectedPartitionStartIndices = Array[Int](0, 4) - checkEstimation(rule, Array(bytesByPartitionId), expectedPartitionStartIndices) - } - } - - test("test estimatePartitionStartIndices - 2 Exchanges") { - val rule = createReduceNumShufflePartitionsRule(100L) - - { - // If there are multiple values of the number of pre-shuffle partitions, - // we should see an assertion error. - val bytesByPartitionId1 = Array[Long](0, 0, 0, 0, 0) - val bytesByPartitionId2 = Array[Long](0, 0, 0, 0, 0, 0) - val mapOutputStatistics = - Array( - new MapOutputStatistics(0, bytesByPartitionId1), - new MapOutputStatistics(1, bytesByPartitionId2)) - intercept[AssertionError](rule.estimatePartitionStartAndEndIndices( - mapOutputStatistics)) - } - - { - // All bytes per partition are 0. - val bytesByPartitionId1 = Array[Long](0, 0, 0, 0, 0) - val bytesByPartitionId2 = Array[Long](0, 0, 0, 0, 0) - val expectedPartitionStartIndices = Array[Int](0) - checkEstimation( - rule, - Array(bytesByPartitionId1, bytesByPartitionId2), - expectedPartitionStartIndices) - } - - { - // Some bytes per partition are 0. - // 1 post-shuffle partition is needed. - val bytesByPartitionId1 = Array[Long](0, 10, 0, 20, 0) - val bytesByPartitionId2 = Array[Long](30, 0, 20, 0, 20) - val expectedPartitionStartIndices = Array[Int](0) - checkEstimation( - rule, - Array(bytesByPartitionId1, bytesByPartitionId2), - expectedPartitionStartIndices) - } - - { - // 2 post-shuffle partition are needed. - val bytesByPartitionId1 = Array[Long](0, 10, 0, 20, 0) - val bytesByPartitionId2 = Array[Long](30, 0, 70, 0, 30) - val expectedPartitionStartIndices = Array[Int](0, 2, 4) - checkEstimation( - rule, - Array(bytesByPartitionId1, bytesByPartitionId2), - expectedPartitionStartIndices) - } - - { - // 4 post-shuffle partition are needed. - val bytesByPartitionId1 = Array[Long](0, 99, 0, 20, 0) - val bytesByPartitionId2 = Array[Long](30, 0, 70, 0, 30) - val expectedPartitionStartIndices = Array[Int](0, 1, 2, 4) - checkEstimation( - rule, - Array(bytesByPartitionId1, bytesByPartitionId2), - expectedPartitionStartIndices) - } - - { - // 2 post-shuffle partition are needed. - val bytesByPartitionId1 = Array[Long](0, 100, 0, 30, 0) - val bytesByPartitionId2 = Array[Long](30, 0, 70, 0, 30) - val expectedPartitionStartIndices = Array[Int](0, 1, 2, 4) - checkEstimation( - rule, - Array(bytesByPartitionId1, bytesByPartitionId2), - expectedPartitionStartIndices) - } - - { - // There are a few large pre-shuffle partitions. - val bytesByPartitionId1 = Array[Long](0, 100, 40, 30, 0) - val bytesByPartitionId2 = Array[Long](30, 0, 60, 0, 110) - val expectedPartitionStartIndices = Array[Int](0, 1, 2, 3, 4) - checkEstimation( - rule, - Array(bytesByPartitionId1, bytesByPartitionId2), - expectedPartitionStartIndices) - } - - { - // All pairs of pre-shuffle partitions are larger than the targeted size. 
- val bytesByPartitionId1 = Array[Long](100, 100, 40, 30, 0) - val bytesByPartitionId2 = Array[Long](30, 0, 60, 70, 110) - val expectedPartitionStartIndices = Array[Int](0, 1, 2, 3, 4) - checkEstimation( - rule, - Array(bytesByPartitionId1, bytesByPartitionId2), - expectedPartitionStartIndices) - } - } - - test("test estimatePartitionStartIndices and enforce minimal number of reducers") { - val rule = createReduceNumShufflePartitionsRule(100L, 2) - - { - // The minimal number of post-shuffle partitions is not enforced because - // the size of data is 0. - val bytesByPartitionId1 = Array[Long](0, 0, 0, 0, 0) - val bytesByPartitionId2 = Array[Long](0, 0, 0, 0, 0) - val expectedPartitionStartIndices = Array[Int](0) - checkEstimation( - rule, - Array(bytesByPartitionId1, bytesByPartitionId2), - expectedPartitionStartIndices) - } - - { - // The minimal number of post-shuffle partitions is enforced. - val bytesByPartitionId1 = Array[Long](10, 5, 5, 0, 20) - val bytesByPartitionId2 = Array[Long](5, 10, 0, 10, 5) - val expectedPartitionStartIndices = Array[Int](0, 3) - checkEstimation( - rule, - Array(bytesByPartitionId1, bytesByPartitionId2), - expectedPartitionStartIndices) - } - - { - // The number of post-shuffle partitions is determined by the coordinator. - val bytesByPartitionId1 = Array[Long](10, 50, 20, 80, 20) - val bytesByPartitionId2 = Array[Long](40, 10, 0, 10, 30) - val expectedPartitionStartIndices = Array[Int](0, 1, 3, 4) - checkEstimation( - rule, - Array(bytesByPartitionId1, bytesByPartitionId2), - expectedPartitionStartIndices) - } - } - - /////////////////////////////////////////////////////////////////////////// - // Query tests - /////////////////////////////////////////////////////////////////////////// - val numInputPartitions: Int = 10 def withSparkSession( diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/ShufflePartitionsCoalescerSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/ShufflePartitionsCoalescerSuite.scala new file mode 100644 index 0000000000000..fcfde83b2ffd5 --- /dev/null +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/ShufflePartitionsCoalescerSuite.scala @@ -0,0 +1,220 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.spark.sql.execution + +import org.apache.spark.{MapOutputStatistics, SparkFunSuite} +import org.apache.spark.sql.execution.adaptive.ShufflePartitionsCoalescer + +class ShufflePartitionsCoalescerSuite extends SparkFunSuite { + + private def checkEstimation( + bytesByPartitionIdArray: Array[Array[Long]], + expectedPartitionStartIndices: Array[Int], + targetSize: Long, + minNumPartitions: Int = 1): Unit = { + val mapOutputStatistics = bytesByPartitionIdArray.zipWithIndex.map { + case (bytesByPartitionId, index) => + new MapOutputStatistics(index, bytesByPartitionId) + } + val estimatedPartitionStartIndices = ShufflePartitionsCoalescer.coalescePartitions( + mapOutputStatistics, + 0, + bytesByPartitionIdArray.head.length, + targetSize, + minNumPartitions) + assert(estimatedPartitionStartIndices === expectedPartitionStartIndices) + } + + test("1 shuffle") { + val targetSize = 100 + + { + // All bytes per partition are 0. + val bytesByPartitionId = Array[Long](0, 0, 0, 0, 0) + val expectedPartitionStartIndices = Array[Int](0) + checkEstimation(Array(bytesByPartitionId), expectedPartitionStartIndices, targetSize) + } + + { + // Some bytes per partition are 0 and total size is less than the target size. + // 1 coalesced partition is expected. + val bytesByPartitionId = Array[Long](10, 0, 20, 0, 0) + val expectedPartitionStartIndices = Array[Int](0) + checkEstimation(Array(bytesByPartitionId), expectedPartitionStartIndices, targetSize) + } + + { + // 2 coalesced partitions are expected. + val bytesByPartitionId = Array[Long](10, 0, 90, 20, 0) + val expectedPartitionStartIndices = Array[Int](0, 3) + checkEstimation(Array(bytesByPartitionId), expectedPartitionStartIndices, targetSize) + } + + { + // There are a few large shuffle partitions. + val bytesByPartitionId = Array[Long](110, 10, 100, 110, 0) + val expectedPartitionStartIndices = Array[Int](0, 1, 2, 3, 4) + checkEstimation(Array(bytesByPartitionId), expectedPartitionStartIndices, targetSize) + } + + { + // All shuffle partitions are larger than the targeted size. + val bytesByPartitionId = Array[Long](100, 110, 100, 110, 110) + val expectedPartitionStartIndices = Array[Int](0, 1, 2, 3, 4) + checkEstimation(Array(bytesByPartitionId), expectedPartitionStartIndices, targetSize) + } + + { + // The last shuffle partition is in a single coalesced partition. + val bytesByPartitionId = Array[Long](30, 30, 0, 40, 110) + val expectedPartitionStartIndices = Array[Int](0, 4) + checkEstimation(Array(bytesByPartitionId), expectedPartitionStartIndices, targetSize) + } + } + + test("2 shuffles") { + val targetSize = 100 + + { + // If there are multiple values of the number of shuffle partitions, + // we should see an assertion error. + val bytesByPartitionId1 = Array[Long](0, 0, 0, 0, 0) + val bytesByPartitionId2 = Array[Long](0, 0, 0, 0, 0, 0) + intercept[AssertionError] { + checkEstimation(Array(bytesByPartitionId1, bytesByPartitionId2), Array.empty, targetSize) + } + } + + { + // All bytes per partition are 0. + val bytesByPartitionId1 = Array[Long](0, 0, 0, 0, 0) + val bytesByPartitionId2 = Array[Long](0, 0, 0, 0, 0) + val expectedPartitionStartIndices = Array[Int](0) + checkEstimation( + Array(bytesByPartitionId1, bytesByPartitionId2), + expectedPartitionStartIndices, + targetSize) + } + + { + // Some bytes per partition are 0. + // 1 coalesced partition is expected. 
+ val bytesByPartitionId1 = Array[Long](0, 10, 0, 20, 0) + val bytesByPartitionId2 = Array[Long](30, 0, 20, 0, 20) + val expectedPartitionStartIndices = Array[Int](0) + checkEstimation( + Array(bytesByPartitionId1, bytesByPartitionId2), + expectedPartitionStartIndices, + targetSize) + } + + { + // 2 coalesced partition are expected. + val bytesByPartitionId1 = Array[Long](0, 10, 0, 20, 0) + val bytesByPartitionId2 = Array[Long](30, 0, 70, 0, 30) + val expectedPartitionStartIndices = Array[Int](0, 2, 4) + checkEstimation( + Array(bytesByPartitionId1, bytesByPartitionId2), + expectedPartitionStartIndices, + targetSize) + } + + { + // 4 coalesced partition are expected. + val bytesByPartitionId1 = Array[Long](0, 99, 0, 20, 0) + val bytesByPartitionId2 = Array[Long](30, 0, 70, 0, 30) + val expectedPartitionStartIndices = Array[Int](0, 1, 2, 4) + checkEstimation( + Array(bytesByPartitionId1, bytesByPartitionId2), + expectedPartitionStartIndices, + targetSize) + } + + { + // 2 coalesced partition are needed. + val bytesByPartitionId1 = Array[Long](0, 100, 0, 30, 0) + val bytesByPartitionId2 = Array[Long](30, 0, 70, 0, 30) + val expectedPartitionStartIndices = Array[Int](0, 1, 2, 4) + checkEstimation( + Array(bytesByPartitionId1, bytesByPartitionId2), + expectedPartitionStartIndices, + targetSize) + } + + { + // There are a few large shuffle partitions. + val bytesByPartitionId1 = Array[Long](0, 100, 40, 30, 0) + val bytesByPartitionId2 = Array[Long](30, 0, 60, 0, 110) + val expectedPartitionStartIndices = Array[Int](0, 1, 2, 3, 4) + checkEstimation( + Array(bytesByPartitionId1, bytesByPartitionId2), + expectedPartitionStartIndices, + targetSize) + } + + { + // All pairs of shuffle partitions are larger than the targeted size. + val bytesByPartitionId1 = Array[Long](100, 100, 40, 30, 0) + val bytesByPartitionId2 = Array[Long](30, 0, 60, 70, 110) + val expectedPartitionStartIndices = Array[Int](0, 1, 2, 3, 4) + checkEstimation( + Array(bytesByPartitionId1, bytesByPartitionId2), + expectedPartitionStartIndices, + targetSize) + } + } + + test("enforce minimal number of coalesced partitions") { + val targetSize = 100 + val minNumPartitions = 2 + + { + // The minimal number of coalesced partitions is not enforced because + // the size of data is 0. + val bytesByPartitionId1 = Array[Long](0, 0, 0, 0, 0) + val bytesByPartitionId2 = Array[Long](0, 0, 0, 0, 0) + val expectedPartitionStartIndices = Array[Int](0) + checkEstimation( + Array(bytesByPartitionId1, bytesByPartitionId2), + expectedPartitionStartIndices, + targetSize, minNumPartitions) + } + + { + // The minimal number of coalesced partitions is enforced. + val bytesByPartitionId1 = Array[Long](10, 5, 5, 0, 20) + val bytesByPartitionId2 = Array[Long](5, 10, 0, 10, 5) + val expectedPartitionStartIndices = Array[Int](0, 3) + checkEstimation( + Array(bytesByPartitionId1, bytesByPartitionId2), + expectedPartitionStartIndices, + targetSize, minNumPartitions) + } + + { + // The number of coalesced partitions is determined by the algorithm. 
+ val bytesByPartitionId1 = Array[Long](10, 50, 20, 80, 20) + val bytesByPartitionId2 = Array[Long](40, 10, 0, 10, 30) + val expectedPartitionStartIndices = Array[Int](0, 1, 3, 4) + checkEstimation( + Array(bytesByPartitionId1, bytesByPartitionId2), + expectedPartitionStartIndices, + targetSize, minNumPartitions) + } + } +} diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/adaptive/AdaptiveQueryExecSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/adaptive/AdaptiveQueryExecSuite.scala index a2071903bea7e..4edb35ea30fde 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/adaptive/AdaptiveQueryExecSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/adaptive/AdaptiveQueryExecSuite.scala @@ -23,7 +23,7 @@ import java.net.URI import org.apache.spark.scheduler.{SparkListener, SparkListenerEvent, SparkListenerJobStart} import org.apache.spark.sql.QueryTest import org.apache.spark.sql.execution.{ReusedSubqueryExec, SparkPlan} -import org.apache.spark.sql.execution.exchange.{BroadcastExchangeExec, Exchange, ReusedExchangeExec} +import org.apache.spark.sql.execution.exchange.{BroadcastExchangeExec, Exchange, ReusedExchangeExec, ShuffleExchangeExec} import org.apache.spark.sql.execution.joins.{BroadcastHashJoinExec, BuildRight, SortMergeJoinExec} import org.apache.spark.sql.execution.ui.SparkListenerSQLAdaptiveExecutionUpdate import org.apache.spark.sql.internal.SQLConf @@ -594,160 +594,84 @@ class AdaptiveQueryExecSuite .range(0, 1000, 1, 10) .selectExpr("id % 1 as key2", "id as value2") .createOrReplaceTempView("skewData2") - val (innerPlan, innerAdaptivePlan) = runAdaptiveAndVerifyResult( + val (_, innerAdaptivePlan) = runAdaptiveAndVerifyResult( "SELECT key1 FROM skewData1 join skewData2 ON key1 = key2 group by key1") - val innerSmj = findTopLevelSortMergeJoin(innerPlan) - assert(innerSmj.size == 1) // Additional shuffle introduced, so disable the "OptimizeSkewedJoin" optimization - val innerSmjAfter = findTopLevelSortMergeJoin(innerAdaptivePlan) - assert(innerSmjAfter.size == 1) + val innerSmj = findTopLevelSortMergeJoin(innerAdaptivePlan) + assert(innerSmj.size == 1 && !innerSmj.head.isSkewJoin) } } } + // TODO: we need a way to customize data distribution after shuffle, to improve test coverage + // of this case. test("SPARK-29544: adaptive skew join with different join types") { - Seq("false", "true").foreach { reducePostShufflePartitionsEnabled => - withSQLConf( - SQLConf.ADAPTIVE_EXECUTION_ENABLED.key -> "true", - SQLConf.AUTO_BROADCASTJOIN_THRESHOLD.key -> "-1", - SQLConf.ADAPTIVE_EXECUTION_SKEWED_PARTITION_SIZE_THRESHOLD.key -> "100", - SQLConf.REDUCE_POST_SHUFFLE_PARTITIONS_ENABLED.key -> reducePostShufflePartitionsEnabled, - SQLConf.SHUFFLE_TARGET_POSTSHUFFLE_INPUT_SIZE.key -> "700") { - withTempView("skewData1", "skewData2") { - spark - .range(0, 1000, 1, 10) - .selectExpr("id % 2 as key1", "id as value1") - .createOrReplaceTempView("skewData1") - spark - .range(0, 1000, 1, 10) - .selectExpr("id % 1 as key2", "id as value2") - .createOrReplaceTempView("skewData2") - // skewed inner join optimization - val (innerPlan, innerAdaptivePlan) = runAdaptiveAndVerifyResult( - "SELECT * FROM skewData1 join skewData2 ON key1 = key2") - val innerSmj = findTopLevelSortMergeJoin(innerPlan) - assert(innerSmj.size == 1) - // left stats: [3496, 0, 0, 0, 4014] - // right stats:[6292, 0, 0, 0, 0] - // the partition 0 in both left and right side are all skewed. 
- // And divide into 5 splits both in left and right (the max splits number). - // So there are 5 x 5 smjs for partition 0. - // Partition 4 in left side is skewed and is divided into 5 splits. - // The right side of partition 4 is not skewed. - // So there are 5 smjs for partition 4. - // So total (25 + 5 + 1) smjs. - // Union - // +- SortMergeJoin - // +- Sort - // +- CoalescedShuffleReader - // +- ShuffleQueryStage - // +- Sort - // +- CoalescedShuffleReader - // +- ShuffleQueryStage - // +- SortMergeJoin - // +- Sort - // +- SkewedShuffleReader - // +- ShuffleQueryStage - // +- Sort - // +- SkewedShuffleReader - // +- ShuffleQueryStage - // . - // . - // . - // +- SortMergeJoin - // +- Sort - // +- SkewedShuffleReader - // +- ShuffleQueryStage - // +- Sort - // +- SkewedShuffleReader - // +- ShuffleQueryStage - - val innerSmjAfter = findTopLevelSortMergeJoin(innerAdaptivePlan) - assert(innerSmjAfter.size == 31) - - // skewed left outer join optimization - val (leftPlan, leftAdaptivePlan) = runAdaptiveAndVerifyResult( - "SELECT * FROM skewData1 left outer join skewData2 ON key1 = key2") - val leftSmj = findTopLevelSortMergeJoin(leftPlan) - assert(leftSmj.size == 1) - // left stats: [3496, 0, 0, 0, 4014] - // right stats:[6292, 0, 0, 0, 0] - // The partition 0 in both left and right are all skewed. - // The partition 4 in left side is skewed. - // But for left outer join, we don't split the right partition even skewed. - // So the partition 0 in left side is divided into 5 splits(the max split number). - // the partition 4 in left side is divided into 5 splits(the max split number). - // So total (5 + 5 + 1) smjs. - // Union - // +- SortMergeJoin - // +- Sort - // +- CoalescedShuffleReader - // +- ShuffleQueryStage - // +- Sort - // +- CoalescedShuffleReader - // +- ShuffleQueryStage - // +- SortMergeJoin - // +- Sort - // +- SkewedShuffleReader - // +- ShuffleQueryStage - // +- Sort - // +- SkewedShuffleReader - // +- ShuffleQueryStage - // . - // . - // . - // +- SortMergeJoin - // +- Sort - // +- SkewedShuffleReader - // +- ShuffleQueryStage - // +- Sort - // +- SkewedShuffleReader - // +- ShuffleQueryStage - - val leftSmjAfter = findTopLevelSortMergeJoin(leftAdaptivePlan) - assert(leftSmjAfter.size == 11) - - // skewed right outer join optimization - val (rightPlan, rightAdaptivePlan) = runAdaptiveAndVerifyResult( - "SELECT * FROM skewData1 right outer join skewData2 ON key1 = key2") - val rightSmj = findTopLevelSortMergeJoin(rightPlan) - assert(rightSmj.size == 1) - // left stats: [3496, 0, 0, 0, 4014] - // right stats:[6292, 0, 0, 0, 0] - // The partition 0 in both left and right side are all skewed. - // And the partition 4 in left side is skewed. - // But for right outer join, we don't split the left partition even skewed. - // And divide right side into 5 splits(the max split number) - // So total 6 smjs. - // Union - // +- SortMergeJoin - // +- Sort - // +- CoalescedShuffleReader - // +- ShuffleQueryStage - // +- Sort - // +- CoalescedShuffleReader - // +- ShuffleQueryStage - // +- SortMergeJoin - // +- Sort - // +- SkewedShuffleReader - // +- ShuffleQueryStage - // +- Sort - // +- SkewedShuffleReader - // +- ShuffleQueryStage - // . - // . - // . 
- // +- SortMergeJoin - // +- Sort - // +- SkewedShuffleReader - // +- ShuffleQueryStage - // +- Sort - // +- SkewedShuffleReader - // +- ShuffleQueryStage - - val rightSmjAfter = findTopLevelSortMergeJoin(rightAdaptivePlan) - assert(rightSmjAfter.size == 6) + withSQLConf( + SQLConf.ADAPTIVE_EXECUTION_ENABLED.key -> "true", + SQLConf.AUTO_BROADCASTJOIN_THRESHOLD.key -> "-1", + SQLConf.ADAPTIVE_EXECUTION_SKEWED_PARTITION_SIZE_THRESHOLD.key -> "100", + SQLConf.SHUFFLE_TARGET_POSTSHUFFLE_INPUT_SIZE.key -> "700") { + withTempView("skewData1", "skewData2") { + spark + .range(0, 1000, 1, 10) + .selectExpr("id % 2 as key1", "id as value1") + .createOrReplaceTempView("skewData1") + spark + .range(0, 1000, 1, 10) + .selectExpr("id % 1 as key2", "id as value2") + .createOrReplaceTempView("skewData2") + + def checkSkewJoin(joins: Seq[SortMergeJoinExec], expectedNumPartitions: Int): Unit = { + assert(joins.size == 1 && joins.head.isSkewJoin) + assert(joins.head.left.collect { + case r: SkewJoinShuffleReaderExec => r + }.head.partitionSpecs.length == expectedNumPartitions) + assert(joins.head.right.collect { + case r: SkewJoinShuffleReaderExec => r + }.head.partitionSpecs.length == expectedNumPartitions) } + + // skewed inner join optimization + val (_, innerAdaptivePlan) = runAdaptiveAndVerifyResult( + "SELECT * FROM skewData1 join skewData2 ON key1 = key2") + // left stats: [3496, 0, 0, 0, 4014] + // right stats:[6292, 0, 0, 0, 0] + // Partition 0: both left and right sides are skewed, and divide into 5 splits, so + // 5 x 5 sub-partitions. + // Partition 1, 2, 3: not skewed, and coalesced into 1 partition. + // Partition 4: only left side is skewed, and divide into 5 splits, so + // 5 sub-partitions. + // So total (25 + 1 + 5) partitions. + val innerSmj = findTopLevelSortMergeJoin(innerAdaptivePlan) + checkSkewJoin(innerSmj, 25 + 1 + 5) + + // skewed left outer join optimization + val (_, leftAdaptivePlan) = runAdaptiveAndVerifyResult( + "SELECT * FROM skewData1 left outer join skewData2 ON key1 = key2") + // left stats: [3496, 0, 0, 0, 4014] + // right stats:[6292, 0, 0, 0, 0] + // Partition 0: both left and right sides are skewed, but left join can't split right side, + // so only left side is divided into 5 splits, and thus 5 sub-partitions. + // Partition 1, 2, 3: not skewed, and coalesced into 1 partition. + // Partition 4: only left side is skewed, and divide into 5 splits, so + // 5 sub-partitions. + // So total (5 + 1 + 5) partitions. + val leftSmj = findTopLevelSortMergeJoin(leftAdaptivePlan) + checkSkewJoin(leftSmj, 5 + 1 + 5) + + // skewed right outer join optimization + val (_, rightAdaptivePlan) = runAdaptiveAndVerifyResult( + "SELECT * FROM skewData1 right outer join skewData2 ON key1 = key2") + // left stats: [3496, 0, 0, 0, 4014] + // right stats:[6292, 0, 0, 0, 0] + // Partition 0: both left and right sides are skewed, but right join can't split left side, + // so only right side is divided into 5 splits, and thus 5 sub-partitions. + // Partition 1, 2, 3: not skewed, and coalesced into 1 partition. + // Partition 4: only left side is skewed, but right join can't split left side, so just + // 1 partition. + // So total (5 + 1 + 1) partitions. 
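The expected spec counts passed to `checkSkewJoin` follow from a simple per-partition product rule, sketched here as REPL-pasteable Scala. The grouping of partitions 1-3 into one coalesced spec and the split count of 5 are taken from the comments above; `expectedSpecs` is an illustrative helper, not part of the test suite.

```
// Per reducer partition, each side contributes the number of map-range splits it was divided
// into (1 if it stays whole), and each reader ends up with the product of the two as specs.
// The three partition groups are: partition 0, partitions 1-3 (coalesced), and partition 4.
def expectedSpecs(leftSplits: Seq[Int], rightSplits: Seq[Int]): Int =
  leftSplits.zip(rightSplits).map { case (l, r) => l * r }.sum

assert(expectedSpecs(Seq(5, 1, 5), Seq(5, 1, 1)) == 25 + 1 + 5)  // inner join
assert(expectedSpecs(Seq(5, 1, 5), Seq(1, 1, 1)) == 5 + 1 + 5)   // left outer: right side not split
assert(expectedSpecs(Seq(1, 1, 1), Seq(5, 1, 1)) == 5 + 1 + 1)   // right outer: left side not split
```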
+ val rightSmj = findTopLevelSortMergeJoin(rightAdaptivePlan) + checkSkewJoin(rightSmj, 5 + 1 + 1) } } } @@ -805,3 +729,4 @@ class AdaptiveQueryExecSuite s" enabled but is not supported for"))) } } + From 859699135cb63b57f5d844e762070065cedb4408 Mon Sep 17 00:00:00 2001 From: Dongjoon Hyun Date: Thu, 13 Feb 2020 11:17:27 -0800 Subject: [PATCH 038/185] [SPARK-30807][K8S][TESTS] Support Java 11 in K8S integration tests ### What changes were proposed in this pull request? This PR aims to support JDK11 test in K8S integration tests. - This is an update in testing framework instead of individual tests. - This will enable JDK11 runtime test when you didn't installed JDK11 on your local system. ### Why are the changes needed? Apache Spark 3.0.0 adds JDK11 support, but K8s integration tests use JDK8 until now. ### Does this PR introduce any user-facing change? No. This is a dev-only test-related PR. ### How was this patch tested? This is irrelevant to Jenkins UT, but Jenkins K8S IT (JDK8) should pass. - https://github.com/apache/spark/pull/27559#issuecomment-585903489 (JDK8 Passed) And, manually do the following for JDK11 test. ``` $ NO_MANUAL=1 ./dev/make-distribution.sh --r --pip --tgz -Phadoop-3.2 -Pkubernetes $ resource-managers/kubernetes/integration-tests/dev/dev-run-integration-tests.sh --java-image-tag 11-jre-slim --spark-tgz $PWD/spark-*.tgz ``` ``` $ docker run -it --rm kubespark/spark:1318DD8A-2B15-4A00-BC69-D0E90CED235B /usr/local/openjdk-11/bin/java --version | tail -n1 OpenJDK 64-Bit Server VM 18.9 (build 11.0.6+10, mixed mode) ``` Closes #27559 from dongjoon-hyun/SPARK-30807. Authored-by: Dongjoon Hyun Signed-off-by: Dongjoon Hyun --- .../docker/src/main/dockerfiles/spark/Dockerfile | 3 ++- .../kubernetes/integration-tests/README.md | 15 +++++++++++++-- .../dev/dev-run-integration-tests.sh | 10 ++++++++++ .../kubernetes/integration-tests/pom.xml | 4 ++++ .../scripts/setup-integration-test-env.sh | 14 +++++++++++--- 5 files changed, 40 insertions(+), 6 deletions(-) diff --git a/resource-managers/kubernetes/docker/src/main/dockerfiles/spark/Dockerfile b/resource-managers/kubernetes/docker/src/main/dockerfiles/spark/Dockerfile index a1fc63789bc61..6ed37fc637b31 100644 --- a/resource-managers/kubernetes/docker/src/main/dockerfiles/spark/Dockerfile +++ b/resource-managers/kubernetes/docker/src/main/dockerfiles/spark/Dockerfile @@ -14,8 +14,9 @@ # See the License for the specific language governing permissions and # limitations under the License. # +ARG java_image_tag=8-jre-slim -FROM openjdk:8-jre-slim +FROM openjdk:${java_image_tag} ARG spark_uid=185 diff --git a/resource-managers/kubernetes/integration-tests/README.md b/resource-managers/kubernetes/integration-tests/README.md index d7ad35a175a61..18b91916208d6 100644 --- a/resource-managers/kubernetes/integration-tests/README.md +++ b/resource-managers/kubernetes/integration-tests/README.md @@ -6,13 +6,17 @@ title: Spark on Kubernetes Integration Tests # Running the Kubernetes Integration Tests Note that the integration test framework is currently being heavily revised and -is subject to change. Note that currently the integration tests only run with Java 8. +is subject to change. The simplest way to run the integration tests is to install and run Minikube, then run the following from this directory: ./dev/dev-run-integration-tests.sh +To run tests with Java 11 instead of Java 8, use `--java-image-tag` to specify the base image. 
+ + ./dev/dev-run-integration-tests.sh --java-image-tag 11-jre-slim + The minimum tested version of Minikube is 0.23.0. The kube-dns addon must be enabled. Minikube should run with a minimum of 4 CPUs and 6G of memory: @@ -183,7 +187,14 @@ to the wrapper scripts and using the wrapper scripts will simply set these appro A specific image tag to use, when set assumes images with those tags are already built and available in the specified image repository. When set to N/A (the default) fresh images will be built. - N/A + N/A + + + spark.kubernetes.test.javaImageTag + + A specific OpenJDK base image tag to use, when set uses it instead of 8-jre-slim. + + 8-jre-slim spark.kubernetes.test.imageTagFile diff --git a/resource-managers/kubernetes/integration-tests/dev/dev-run-integration-tests.sh b/resource-managers/kubernetes/integration-tests/dev/dev-run-integration-tests.sh index 1f0a8035cea7b..76d6e1c1e8499 100755 --- a/resource-managers/kubernetes/integration-tests/dev/dev-run-integration-tests.sh +++ b/resource-managers/kubernetes/integration-tests/dev/dev-run-integration-tests.sh @@ -23,6 +23,7 @@ DEPLOY_MODE="minikube" IMAGE_REPO="docker.io/kubespark" SPARK_TGZ="N/A" IMAGE_TAG="N/A" +JAVA_IMAGE_TAG= BASE_IMAGE_NAME= JVM_IMAGE_NAME= PYTHON_IMAGE_NAME= @@ -52,6 +53,10 @@ while (( "$#" )); do IMAGE_TAG="$2" shift ;; + --java-image-tag) + JAVA_IMAGE_TAG="$2" + shift + ;; --deploy-mode) DEPLOY_MODE="$2" shift @@ -120,6 +125,11 @@ properties=( -Dtest.include.tags=$INCLUDE_TAGS ) +if [ -n "$JAVA_IMAGE_TAG" ]; +then + properties=( ${properties[@]} -Dspark.kubernetes.test.javaImageTag=$JAVA_IMAGE_TAG ) +fi + if [ -n $NAMESPACE ]; then properties=( ${properties[@]} -Dspark.kubernetes.test.namespace=$NAMESPACE ) diff --git a/resource-managers/kubernetes/integration-tests/pom.xml b/resource-managers/kubernetes/integration-tests/pom.xml index 8e1043f77db6d..369dfd491826c 100644 --- a/resource-managers/kubernetes/integration-tests/pom.xml +++ b/resource-managers/kubernetes/integration-tests/pom.xml @@ -39,6 +39,7 @@ ${project.build.directory}/spark-dist-unpacked N/A + 8-jre-slim ${project.build.directory}/imageTag.txt minikube docker.io/kubespark @@ -109,6 +110,9 @@ --image-tag ${spark.kubernetes.test.imageTag} + --java-image-tag + ${spark.kubernetes.test.javaImageTag} + --image-tag-output-file ${spark.kubernetes.test.imageTagFile} diff --git a/resource-managers/kubernetes/integration-tests/scripts/setup-integration-test-env.sh b/resource-managers/kubernetes/integration-tests/scripts/setup-integration-test-env.sh index 9e04b963fc40e..ab906604fce06 100755 --- a/resource-managers/kubernetes/integration-tests/scripts/setup-integration-test-env.sh +++ b/resource-managers/kubernetes/integration-tests/scripts/setup-integration-test-env.sh @@ -23,6 +23,7 @@ IMAGE_TAG_OUTPUT_FILE="$TEST_ROOT_DIR/target/image-tag.txt" DEPLOY_MODE="minikube" IMAGE_REPO="docker.io/kubespark" IMAGE_TAG="N/A" +JAVA_IMAGE_TAG="8-jre-slim" SPARK_TGZ="N/A" # Parse arguments @@ -40,6 +41,10 @@ while (( "$#" )); do IMAGE_TAG="$2" shift ;; + --java-image-tag) + JAVA_IMAGE_TAG="$2" + shift + ;; --image-tag-output-file) IMAGE_TAG_OUTPUT_FILE="$2" shift @@ -82,6 +87,9 @@ then IMAGE_TAG=$(uuidgen); cd $SPARK_INPUT_DIR + # OpenJDK base-image tag (e.g. 
8-jre-slim, 11-jre-slim) + JAVA_IMAGE_TAG_BUILD_ARG="-b java_image_tag=$JAVA_IMAGE_TAG" + # Build PySpark image LANGUAGE_BINDING_BUILD_ARGS="-p $DOCKER_FILE_BASE_PATH/bindings/python/Dockerfile" @@ -95,7 +103,7 @@ then case $DEPLOY_MODE in cloud) # Build images - $SPARK_INPUT_DIR/bin/docker-image-tool.sh -r $IMAGE_REPO -t $IMAGE_TAG $LANGUAGE_BINDING_BUILD_ARGS build + $SPARK_INPUT_DIR/bin/docker-image-tool.sh -r $IMAGE_REPO -t $IMAGE_TAG $JAVA_IMAGE_TAG_BUILD_ARG $LANGUAGE_BINDING_BUILD_ARGS build # Push images appropriately if [[ $IMAGE_REPO == gcr.io* ]] ; @@ -109,13 +117,13 @@ then docker-for-desktop) # Only need to build as this will place it in our local Docker repo which is all # we need for Docker for Desktop to work so no need to also push - $SPARK_INPUT_DIR/bin/docker-image-tool.sh -r $IMAGE_REPO -t $IMAGE_TAG $LANGUAGE_BINDING_BUILD_ARGS build + $SPARK_INPUT_DIR/bin/docker-image-tool.sh -r $IMAGE_REPO -t $IMAGE_TAG $JAVA_IMAGE_TAG_BUILD_ARG $LANGUAGE_BINDING_BUILD_ARGS build ;; minikube) # Only need to build and if we do this with the -m option for minikube we will # build the images directly using the minikube Docker daemon so no need to push - $SPARK_INPUT_DIR/bin/docker-image-tool.sh -m -r $IMAGE_REPO -t $IMAGE_TAG $LANGUAGE_BINDING_BUILD_ARGS build + $SPARK_INPUT_DIR/bin/docker-image-tool.sh -m -r $IMAGE_REPO -t $IMAGE_TAG $JAVA_IMAGE_TAG_BUILD_ARG $LANGUAGE_BINDING_BUILD_ARGS build ;; *) echo "Unrecognized deploy mode $DEPLOY_MODE" && exit 1 From 74cd46eb691be5dc1cb1c496eeeaa2614945bd98 Mon Sep 17 00:00:00 2001 From: Dongjoon Hyun Date: Thu, 13 Feb 2020 11:42:00 -0800 Subject: [PATCH 039/185] [SPARK-30816][K8S][TESTS] Fix dev-run-integration-tests.sh to ignore empty params ### What changes were proposed in this pull request? This PR aims to fix `dev-run-integration-tests.sh` to ignore empty params correctly. ### Why are the changes needed? The following script runs `mvn` integration test like the following. ``` $ resource-managers/kubernetes/integration-tests/dev/dev-run-integration-tests.sh ... build/mvn integration-test -f /Users/dongjoon/APACHE/spark/pom.xml -pl resource-managers/kubernetes/integration-tests -am -Pscala-2.12 -Pkubernetes -Pkubernetes-integration-tests -Djava.version=8 -Dspark.kubernetes.test.sparkTgz=N/A -Dspark.kubernetes.test.imageTag=N/A -Dspark.kubernetes.test.imageRepo=docker.io/kubespark -Dspark.kubernetes.test.deployMode=minikube -Dtest.include.tags=k8s -Dspark.kubernetes.test.namespace= -Dspark.kubernetes.test.serviceAccountName= -Dspark.kubernetes.test.kubeConfigContext= -Dspark.kubernetes.test.master= -Dtest.exclude.tags= -Dspark.kubernetes.test.jvmImage=spark -Dspark.kubernetes.test.pythonImage=spark-py -Dspark.kubernetes.test.rImage=spark-r ``` After this PR, the empty parameters like the followings will be skipped like the original design. ``` -Dspark.kubernetes.test.namespace= -Dspark.kubernetes.test.serviceAccountName= -Dspark.kubernetes.test.kubeConfigContext= -Dspark.kubernetes.test.master= -Dtest.exclude.tags= ``` ### Does this PR introduce any user-facing change? No. ### How was this patch tested? Pass the Jenkins K8S integration test. Closes #27566 from dongjoon-hyun/SPARK-30816. 
Authored-by: Dongjoon Hyun Signed-off-by: Dongjoon Hyun --- .../integration-tests/dev/dev-run-integration-tests.sh | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/resource-managers/kubernetes/integration-tests/dev/dev-run-integration-tests.sh b/resource-managers/kubernetes/integration-tests/dev/dev-run-integration-tests.sh index 76d6e1c1e8499..607bb243458a6 100755 --- a/resource-managers/kubernetes/integration-tests/dev/dev-run-integration-tests.sh +++ b/resource-managers/kubernetes/integration-tests/dev/dev-run-integration-tests.sh @@ -130,27 +130,27 @@ then properties=( ${properties[@]} -Dspark.kubernetes.test.javaImageTag=$JAVA_IMAGE_TAG ) fi -if [ -n $NAMESPACE ]; +if [ -n "$NAMESPACE" ]; then properties=( ${properties[@]} -Dspark.kubernetes.test.namespace=$NAMESPACE ) fi -if [ -n $SERVICE_ACCOUNT ]; +if [ -n "$SERVICE_ACCOUNT" ]; then properties=( ${properties[@]} -Dspark.kubernetes.test.serviceAccountName=$SERVICE_ACCOUNT ) fi -if [ -n $CONTEXT ]; +if [ -n "$CONTEXT" ]; then properties=( ${properties[@]} -Dspark.kubernetes.test.kubeConfigContext=$CONTEXT ) fi -if [ -n $SPARK_MASTER ]; +if [ -n "$SPARK_MASTER" ]; then properties=( ${properties[@]} -Dspark.kubernetes.test.master=$SPARK_MASTER ) fi -if [ -n $EXCLUDE_TAGS ]; +if [ -n "$EXCLUDE_TAGS" ]; then properties=( ${properties[@]} -Dtest.exclude.tags=$EXCLUDE_TAGS ) fi From e2d3983de78f5c80fac066b7ee8bedd0987110dd Mon Sep 17 00:00:00 2001 From: Ali Afroozeh Date: Thu, 13 Feb 2020 23:58:55 +0100 Subject: [PATCH 040/185] [SPARK-30798][SQL] Scope Session.active in QueryExecution ### What changes were proposed in this pull request? This PR scopes `SparkSession.active` to prevent problems with processing queries with possibly different spark sessions (and different configs). A new method, `withActive` is introduced on `SparkSession` that restores the previous spark session after the block of code is executed. ### Why are the changes needed? `SparkSession.active` is a thread local variable that points to the current thread's spark session. It is important to note that the `SQLConf.get` method depends on `SparkSession.active`. In the current implementation it is possible that `SparkSession.active` points to a different session which causes various problems. Most of these problems arise because part of the query processing is done using the configurations of a different session. For example, when creating a data frame using a new session, i.e., `session.sql("...")`, part of the data frame is constructed using the currently active spark session, which can be a different session from the one used later for processing the query. ### Does this PR introduce any user-facing change? The `withActive` method is introduced on `SparkSession`. ### How was this patch tested? Unit tests (to be added) Closes #27387 from dbaliafroozeh/UseWithActiveSessionInQueryExecution. 
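The `withActive` behavior described above is essentially a scope guard over a thread-local value: remember whatever was active before, install the new session for the duration of the block, and restore the previous value in a `finally`. A minimal, self-contained sketch of that shape follows; the names (`DemoSession`, `ActiveScopeDemo`) are illustrative stand-ins, not Spark's API.

```scala
object ActiveScopeDemo {
  final case class DemoSession(name: String)

  // Thread-local holder standing in for the "currently active" session.
  private val active = new ThreadLocal[DemoSession]

  // Save whatever was active, install `session` for the duration of `block`,
  // and restore the previous value afterwards, even if `block` throws.
  def withActive[T](session: DemoSession)(block: => T): T = {
    val old = active.get() // may be null if nothing was active on this thread
    active.set(session)
    try block finally active.set(old)
  }

  def main(args: Array[String]): Unit = {
    active.set(DemoSession("outer"))
    withActive(DemoSession("inner")) {
      println(active.get()) // DemoSession(inner) while the block runs
    }
    println(active.get()) // DemoSession(outer) is restored afterwards
  }
}
```

Reading the old value straight from the thread local, rather than falling back to any default, is what keeps a default session from being promoted to the active one once the block completes, which is the point the patch makes in its inline comment.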
Authored-by: Ali Afroozeh Signed-off-by: herman --- .../apache/spark/sql/DataFrameWriter.scala | 2 +- .../apache/spark/sql/DataFrameWriterV2.scala | 2 +- .../scala/org/apache/spark/sql/Dataset.scala | 36 ++++++++++--------- .../spark/sql/KeyValueGroupedDataset.scala | 5 +-- .../org/apache/spark/sql/SparkSession.scala | 30 +++++++++++----- .../spark/sql/execution/QueryExecution.scala | 16 +++++---- .../spark/sql/execution/SQLExecution.scala | 4 +-- .../streaming/MicroBatchExecution.scala | 4 +-- .../continuous/ContinuousExecution.scala | 2 +- .../spark/sql/internal/CatalogImpl.scala | 2 +- .../org/apache/spark/sql/DatasetSuite.scala | 10 ++++++ .../apache/spark/sql/SQLQueryTestSuite.scala | 2 +- .../ui/SQLAppStatusListenerSuite.scala | 2 +- .../SparkExecuteStatementOperation.scala | 2 +- .../hive/thriftserver/SparkSQLDriver.scala | 2 +- .../hive/execution/HiveComparisonTest.scala | 3 +- .../apache/spark/sql/hive/test/TestHive.scala | 2 +- 17 files changed, 74 insertions(+), 52 deletions(-) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/DataFrameWriter.scala b/sql/core/src/main/scala/org/apache/spark/sql/DataFrameWriter.scala index 4557219abeb15..fff1f4b636dea 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/DataFrameWriter.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/DataFrameWriter.scala @@ -896,7 +896,7 @@ final class DataFrameWriter[T] private[sql](ds: Dataset[T]) { private def runCommand(session: SparkSession, name: String)(command: LogicalPlan): Unit = { val qe = session.sessionState.executePlan(command) // call `QueryExecution.toRDD` to trigger the execution of commands. - SQLExecution.withNewExecutionId(session, qe, Some(name))(qe.toRdd) + SQLExecution.withNewExecutionId(qe, Some(name))(qe.toRdd) } private def lookupV2Provider(): Option[TableProvider] = { diff --git a/sql/core/src/main/scala/org/apache/spark/sql/DataFrameWriterV2.scala b/sql/core/src/main/scala/org/apache/spark/sql/DataFrameWriterV2.scala index f5dd7613d4103..cf6bde5a2bcb9 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/DataFrameWriterV2.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/DataFrameWriterV2.scala @@ -226,7 +226,7 @@ final class DataFrameWriterV2[T] private[sql](table: String, ds: Dataset[T]) private def runCommand(name: String)(command: LogicalPlan): Unit = { val qe = sparkSession.sessionState.executePlan(command) // call `QueryExecution.toRDD` to trigger the execution of commands. 
- SQLExecution.withNewExecutionId(sparkSession, qe, Some(name))(qe.toRdd) + SQLExecution.withNewExecutionId(qe, Some(name))(qe.toRdd) } private def internalReplace(orCreate: Boolean): Unit = { diff --git a/sql/core/src/main/scala/org/apache/spark/sql/Dataset.scala b/sql/core/src/main/scala/org/apache/spark/sql/Dataset.scala index a1c33f92d17b4..42f35354e864f 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/Dataset.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/Dataset.scala @@ -82,18 +82,19 @@ private[sql] object Dataset { dataset } - def ofRows(sparkSession: SparkSession, logicalPlan: LogicalPlan): DataFrame = { - val qe = sparkSession.sessionState.executePlan(logicalPlan) - qe.assertAnalyzed() - new Dataset[Row](sparkSession, qe, RowEncoder(qe.analyzed.schema)) + def ofRows(sparkSession: SparkSession, logicalPlan: LogicalPlan): DataFrame = + sparkSession.withActive { + val qe = sparkSession.sessionState.executePlan(logicalPlan) + qe.assertAnalyzed() + new Dataset[Row](qe, RowEncoder(qe.analyzed.schema)) } /** A variant of ofRows that allows passing in a tracker so we can track query parsing time. */ def ofRows(sparkSession: SparkSession, logicalPlan: LogicalPlan, tracker: QueryPlanningTracker) - : DataFrame = { + : DataFrame = sparkSession.withActive { val qe = new QueryExecution(sparkSession, logicalPlan, tracker) qe.assertAnalyzed() - new Dataset[Row](sparkSession, qe, RowEncoder(qe.analyzed.schema)) + new Dataset[Row](qe, RowEncoder(qe.analyzed.schema)) } } @@ -185,13 +186,12 @@ private[sql] object Dataset { */ @Stable class Dataset[T] private[sql]( - @transient private val _sparkSession: SparkSession, @DeveloperApi @Unstable @transient val queryExecution: QueryExecution, @DeveloperApi @Unstable @transient val encoder: Encoder[T]) extends Serializable { @transient lazy val sparkSession: SparkSession = { - if (_sparkSession == null) { + if (queryExecution == null || queryExecution.sparkSession == null) { throw new SparkException( "Dataset transformations and actions can only be invoked by the driver, not inside of" + " other Dataset transformations; for example, dataset1.map(x => dataset2.values.count()" + @@ -199,7 +199,7 @@ class Dataset[T] private[sql]( "performed inside of the dataset1.map transformation. For more information," + " see SPARK-28702.") } - _sparkSession + queryExecution.sparkSession } // A globally unique id of this Dataset. @@ -211,7 +211,7 @@ class Dataset[T] private[sql]( // you wrap it with `withNewExecutionId` if this actions doesn't call other action. def this(sparkSession: SparkSession, logicalPlan: LogicalPlan, encoder: Encoder[T]) = { - this(sparkSession, sparkSession.sessionState.executePlan(logicalPlan), encoder) + this(sparkSession.sessionState.executePlan(logicalPlan), encoder) } def this(sqlContext: SQLContext, logicalPlan: LogicalPlan, encoder: Encoder[T]) = { @@ -445,7 +445,7 @@ class Dataset[T] private[sql]( */ // This is declared with parentheses to prevent the Scala compiler from treating // `ds.toDF("1")` as invoking this toDF and then apply on the returned DataFrame. - def toDF(): DataFrame = new Dataset[Row](sparkSession, queryExecution, RowEncoder(schema)) + def toDF(): DataFrame = new Dataset[Row](queryExecution, RowEncoder(schema)) /** * Returns a new Dataset where each record has been mapped on to the specified type. 
The @@ -503,7 +503,9 @@ class Dataset[T] private[sql]( * @group basic * @since 1.6.0 */ - def schema: StructType = queryExecution.analyzed.schema + def schema: StructType = sparkSession.withActive { + queryExecution.analyzed.schema + } /** * Prints the schema to the console in a nice tree format. @@ -539,7 +541,7 @@ class Dataset[T] private[sql]( * @group basic * @since 3.0.0 */ - def explain(mode: String): Unit = { + def explain(mode: String): Unit = sparkSession.withActive { // Because temporary views are resolved during analysis when we create a Dataset, and // `ExplainCommand` analyzes input query plan and resolves temporary views again. Using // `ExplainCommand` here will probably output different query plans, compared to the results @@ -1502,7 +1504,7 @@ class Dataset[T] private[sql]( val namedColumns = columns.map(_.withInputType(exprEnc, logicalPlan.output).named) val execution = new QueryExecution(sparkSession, Project(namedColumns, logicalPlan)) - new Dataset(sparkSession, execution, ExpressionEncoder.tuple(encoders)) + new Dataset(execution, ExpressionEncoder.tuple(encoders)) } /** @@ -3472,7 +3474,7 @@ class Dataset[T] private[sql]( * an execution. */ private def withNewExecutionId[U](body: => U): U = { - SQLExecution.withNewExecutionId(sparkSession, queryExecution)(body) + SQLExecution.withNewExecutionId(queryExecution)(body) } /** @@ -3481,7 +3483,7 @@ class Dataset[T] private[sql]( * reset. */ private def withNewRDDExecutionId[U](body: => U): U = { - SQLExecution.withNewExecutionId(sparkSession, rddQueryExecution) { + SQLExecution.withNewExecutionId(rddQueryExecution) { rddQueryExecution.executedPlan.resetMetrics() body } @@ -3492,7 +3494,7 @@ class Dataset[T] private[sql]( * user-registered callback functions. */ private def withAction[U](name: String, qe: QueryExecution)(action: SparkPlan => U) = { - SQLExecution.withNewExecutionId(sparkSession, qe, Some(name)) { + SQLExecution.withNewExecutionId(qe, Some(name)) { qe.executedPlan.resetMetrics() action(qe.executedPlan) } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/KeyValueGroupedDataset.scala b/sql/core/src/main/scala/org/apache/spark/sql/KeyValueGroupedDataset.scala index 89cc9735e4f6a..76ee297dfca79 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/KeyValueGroupedDataset.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/KeyValueGroupedDataset.scala @@ -449,10 +449,7 @@ class KeyValueGroupedDataset[K, V] private[sql]( val aggregate = Aggregate(groupingAttributes, keyColumn +: namedColumns, logicalPlan) val execution = new QueryExecution(sparkSession, aggregate) - new Dataset( - sparkSession, - execution, - ExpressionEncoder.tuple(kExprEnc +: encoders)) + new Dataset(execution, ExpressionEncoder.tuple(kExprEnc +: encoders)) } /** diff --git a/sql/core/src/main/scala/org/apache/spark/sql/SparkSession.scala b/sql/core/src/main/scala/org/apache/spark/sql/SparkSession.scala index abefb348cafc7..1fb97fb4b4cf1 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/SparkSession.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/SparkSession.scala @@ -293,8 +293,7 @@ class SparkSession private( * * @since 2.0.0 */ - def createDataFrame[A <: Product : TypeTag](rdd: RDD[A]): DataFrame = { - SparkSession.setActiveSession(this) + def createDataFrame[A <: Product : TypeTag](rdd: RDD[A]): DataFrame = withActive { val encoder = Encoders.product[A] Dataset.ofRows(self, ExternalRDD(rdd, self)(encoder)) } @@ -304,8 +303,7 @@ class SparkSession private( * * @since 2.0.0 */ - def createDataFrame[A <: 
Product : TypeTag](data: Seq[A]): DataFrame = { - SparkSession.setActiveSession(this) + def createDataFrame[A <: Product : TypeTag](data: Seq[A]): DataFrame = withActive { val schema = ScalaReflection.schemaFor[A].dataType.asInstanceOf[StructType] val attributeSeq = schema.toAttributes Dataset.ofRows(self, LocalRelation.fromProduct(attributeSeq, data)) @@ -343,7 +341,7 @@ class SparkSession private( * @since 2.0.0 */ @DeveloperApi - def createDataFrame(rowRDD: RDD[Row], schema: StructType): DataFrame = { + def createDataFrame(rowRDD: RDD[Row], schema: StructType): DataFrame = withActive { // TODO: use MutableProjection when rowRDD is another DataFrame and the applied // schema differs from the existing schema on any field data type. val encoder = RowEncoder(schema) @@ -373,7 +371,7 @@ class SparkSession private( * @since 2.0.0 */ @DeveloperApi - def createDataFrame(rows: java.util.List[Row], schema: StructType): DataFrame = { + def createDataFrame(rows: java.util.List[Row], schema: StructType): DataFrame = withActive { Dataset.ofRows(self, LocalRelation.fromExternalRows(schema.toAttributes, rows.asScala)) } @@ -385,7 +383,7 @@ class SparkSession private( * * @since 2.0.0 */ - def createDataFrame(rdd: RDD[_], beanClass: Class[_]): DataFrame = { + def createDataFrame(rdd: RDD[_], beanClass: Class[_]): DataFrame = withActive { val attributeSeq: Seq[AttributeReference] = getSchema(beanClass) val className = beanClass.getName val rowRdd = rdd.mapPartitions { iter => @@ -414,7 +412,7 @@ class SparkSession private( * SELECT * queries will return the columns in an undefined order. * @since 1.6.0 */ - def createDataFrame(data: java.util.List[_], beanClass: Class[_]): DataFrame = { + def createDataFrame(data: java.util.List[_], beanClass: Class[_]): DataFrame = withActive { val attrSeq = getSchema(beanClass) val rows = SQLContext.beansToRows(data.asScala.iterator, beanClass, attrSeq) Dataset.ofRows(self, LocalRelation(attrSeq, rows.toSeq)) @@ -599,7 +597,7 @@ class SparkSession private( * * @since 2.0.0 */ - def sql(sqlText: String): DataFrame = { + def sql(sqlText: String): DataFrame = withActive { val tracker = new QueryPlanningTracker val plan = tracker.measurePhase(QueryPlanningTracker.PARSING) { sessionState.sqlParser.parsePlan(sqlText) @@ -751,6 +749,20 @@ class SparkSession private( } } + /** + * Execute a block of code with the this session set as the active session, and restore the + * previous session on completion. + */ + private[sql] def withActive[T](block: => T): T = { + // Use the active session thread local directly to make sure we get the session that is actually + // set and not the default session. This to prevent that we promote the default session to the + // active session once we are done. 
+ val old = SparkSession.activeThreadSession.get() + SparkSession.setActiveSession(this) + try block finally { + SparkSession.setActiveSession(old) + } + } } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/QueryExecution.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/QueryExecution.scala index 38ef66682c413..53b6b5d82c021 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/QueryExecution.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/QueryExecution.scala @@ -63,13 +63,12 @@ class QueryExecution( } } - lazy val analyzed: LogicalPlan = tracker.measurePhase(QueryPlanningTracker.ANALYSIS) { - SparkSession.setActiveSession(sparkSession) + lazy val analyzed: LogicalPlan = executePhase(QueryPlanningTracker.ANALYSIS) { // We can't clone `logical` here, which will reset the `_analyzed` flag. sparkSession.sessionState.analyzer.executeAndCheck(logical, tracker) } - lazy val withCachedData: LogicalPlan = { + lazy val withCachedData: LogicalPlan = sparkSession.withActive { assertAnalyzed() assertSupported() // clone the plan to avoid sharing the plan instance between different stages like analyzing, @@ -77,20 +76,20 @@ class QueryExecution( sparkSession.sharedState.cacheManager.useCachedData(analyzed.clone()) } - lazy val optimizedPlan: LogicalPlan = tracker.measurePhase(QueryPlanningTracker.OPTIMIZATION) { + lazy val optimizedPlan: LogicalPlan = executePhase(QueryPlanningTracker.OPTIMIZATION) { // clone the plan to avoid sharing the plan instance between different stages like analyzing, // optimizing and planning. sparkSession.sessionState.optimizer.executeAndTrack(withCachedData.clone(), tracker) } - lazy val sparkPlan: SparkPlan = tracker.measurePhase(QueryPlanningTracker.PLANNING) { + lazy val sparkPlan: SparkPlan = executePhase(QueryPlanningTracker.PLANNING) { // Clone the logical plan here, in case the planner rules change the states of the logical plan. QueryExecution.createSparkPlan(sparkSession, planner, optimizedPlan.clone()) } // executedPlan should not be used to initialize any SparkPlan. It should be // only used for execution. - lazy val executedPlan: SparkPlan = tracker.measurePhase(QueryPlanningTracker.PLANNING) { + lazy val executedPlan: SparkPlan = executePhase(QueryPlanningTracker.PLANNING) { // clone the plan to avoid sharing the plan instance between different stages like analyzing, // optimizing and planning. QueryExecution.prepareForExecution(preparations, sparkPlan.clone()) @@ -116,6 +115,10 @@ class QueryExecution( QueryExecution.preparations(sparkSession) } + private def executePhase[T](phase: String)(block: => T): T = sparkSession.withActive { + tracker.measurePhase(phase)(block) + } + def simpleString: String = simpleString(false) def simpleString(formatted: Boolean): String = withRedaction { @@ -305,7 +308,6 @@ object QueryExecution { sparkSession: SparkSession, planner: SparkPlanner, plan: LogicalPlan): SparkPlan = { - SparkSession.setActiveSession(sparkSession) // TODO: We use next(), i.e. take the first plan returned by the planner, here for now, // but we will implement to choose the best plan. 
planner.plan(ReturnAnswer(plan)).next() diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/SQLExecution.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/SQLExecution.scala index 9f177819f6ea7..59c503e372535 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/SQLExecution.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/SQLExecution.scala @@ -62,9 +62,9 @@ object SQLExecution { * we can connect them with an execution. */ def withNewExecutionId[T]( - sparkSession: SparkSession, queryExecution: QueryExecution, - name: Option[String] = None)(body: => T): T = { + name: Option[String] = None)(body: => T): T = queryExecution.sparkSession.withActive { + val sparkSession = queryExecution.sparkSession val sc = sparkSession.sparkContext val oldExecutionId = sc.getLocalProperty(EXECUTION_ID_KEY) val executionId = SQLExecution.nextExecutionId diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/MicroBatchExecution.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/MicroBatchExecution.scala index 83bc347e23ed4..45a2ce16183a5 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/MicroBatchExecution.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/MicroBatchExecution.scala @@ -563,11 +563,11 @@ class MicroBatchExecution( } val nextBatch = - new Dataset(sparkSessionToRunBatch, lastExecution, RowEncoder(lastExecution.analyzed.schema)) + new Dataset(lastExecution, RowEncoder(lastExecution.analyzed.schema)) val batchSinkProgress: Option[StreamWriterCommitProgress] = reportTimeTaken("addBatch") { - SQLExecution.withNewExecutionId(sparkSessionToRunBatch, lastExecution) { + SQLExecution.withNewExecutionId(lastExecution) { sink match { case s: Sink => s.addBatch(currentBatchId, nextBatch) case _: SupportsWrite => diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/continuous/ContinuousExecution.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/continuous/ContinuousExecution.scala index a9b724a73a18e..a109c2171f3d2 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/continuous/ContinuousExecution.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/continuous/ContinuousExecution.scala @@ -252,7 +252,7 @@ class ContinuousExecution( updateStatusMessage("Running") reportTimeTaken("runContinuous") { - SQLExecution.withNewExecutionId(sparkSessionForQuery, lastExecution) { + SQLExecution.withNewExecutionId(lastExecution) { lastExecution.executedPlan.execute() } } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/internal/CatalogImpl.scala b/sql/core/src/main/scala/org/apache/spark/sql/internal/CatalogImpl.scala index 3740b56cb9cbb..d3ef03e9b3b74 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/internal/CatalogImpl.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/internal/CatalogImpl.scala @@ -520,7 +520,7 @@ private[sql] object CatalogImpl { val encoded = data.map(d => enc.toRow(d).copy()) val plan = new LocalRelation(enc.schema.toAttributes, encoded) val queryExecution = sparkSession.sessionState.executePlan(plan) - new Dataset[T](sparkSession, queryExecution, enc) + new Dataset[T](queryExecution, enc) } } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/DatasetSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/DatasetSuite.scala index 233d67898f909..b0bd612e88d98 100644 --- 
a/sql/core/src/test/scala/org/apache/spark/sql/DatasetSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/DatasetSuite.scala @@ -1899,6 +1899,16 @@ class DatasetSuite extends QueryTest val e = intercept[AnalysisException](spark.range(1).tail(-1)) e.getMessage.contains("tail expression must be equal to or greater than 0") } + + test("SparkSession.active should be the same instance after dataset operations") { + val active = SparkSession.getActiveSession.get + val clone = active.cloneSession() + val ds = new Dataset(clone, spark.range(10).queryExecution.logical, Encoders.INT) + + ds.queryExecution.analyzed + + assert(active eq SparkSession.getActiveSession.get) + } } object AssertExecutionId { diff --git a/sql/core/src/test/scala/org/apache/spark/sql/SQLQueryTestSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/SQLQueryTestSuite.scala index da4727f6a98cb..83285911b3948 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/SQLQueryTestSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/SQLQueryTestSuite.scala @@ -511,7 +511,7 @@ class SQLQueryTestSuite extends QueryTest with SharedSparkSession { val df = session.sql(sql) val schema = df.schema.catalogString // Get answer, but also get rid of the #1234 expression ids that show up in explain plans - val answer = SQLExecution.withNewExecutionId(session, df.queryExecution, Some(sql)) { + val answer = SQLExecution.withNewExecutionId(df.queryExecution, Some(sql)) { hiveResultString(df.queryExecution.executedPlan).map(replaceNotIncludedMsg) } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/ui/SQLAppStatusListenerSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/ui/SQLAppStatusListenerSuite.scala index fdfd392a224cb..d18a35c3110f9 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/ui/SQLAppStatusListenerSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/ui/SQLAppStatusListenerSuite.scala @@ -508,7 +508,7 @@ class SQLAppStatusListenerSuite extends SharedSparkSession with JsonTestUtils override lazy val executedPlan = physicalPlan } - SQLExecution.withNewExecutionId(spark, dummyQueryExecution) { + SQLExecution.withNewExecutionId(dummyQueryExecution) { physicalPlan.execute().collect() } diff --git a/sql/hive-thriftserver/src/main/scala/org/apache/spark/sql/hive/thriftserver/SparkExecuteStatementOperation.scala b/sql/hive-thriftserver/src/main/scala/org/apache/spark/sql/hive/thriftserver/SparkExecuteStatementOperation.scala index 76d07848f79a9..cf0e5ebf3a2b1 100644 --- a/sql/hive-thriftserver/src/main/scala/org/apache/spark/sql/hive/thriftserver/SparkExecuteStatementOperation.scala +++ b/sql/hive-thriftserver/src/main/scala/org/apache/spark/sql/hive/thriftserver/SparkExecuteStatementOperation.scala @@ -295,7 +295,7 @@ private[hive] class SparkExecuteStatementOperation( resultList.get.iterator } } - dataTypes = result.queryExecution.analyzed.output.map(_.dataType).toArray + dataTypes = result.schema.fields.map(_.dataType) } catch { // Actually do need to catch Throwable as some failures don't inherit from Exception and // HiveServer will silently swallow them. 
diff --git a/sql/hive-thriftserver/src/main/scala/org/apache/spark/sql/hive/thriftserver/SparkSQLDriver.scala b/sql/hive-thriftserver/src/main/scala/org/apache/spark/sql/hive/thriftserver/SparkSQLDriver.scala index 362ac362e9718..12fba0eae6dce 100644 --- a/sql/hive-thriftserver/src/main/scala/org/apache/spark/sql/hive/thriftserver/SparkSQLDriver.scala +++ b/sql/hive-thriftserver/src/main/scala/org/apache/spark/sql/hive/thriftserver/SparkSQLDriver.scala @@ -61,7 +61,7 @@ private[hive] class SparkSQLDriver(val context: SQLContext = SparkSQLEnv.sqlCont try { context.sparkContext.setJobDescription(command) val execution = context.sessionState.executePlan(context.sql(command).logicalPlan) - hiveResponse = SQLExecution.withNewExecutionId(context.sparkSession, execution) { + hiveResponse = SQLExecution.withNewExecutionId(execution) { hiveResultString(execution.executedPlan) } tableSchema = getResultSetSchema(execution) diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveComparisonTest.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveComparisonTest.scala index 28e1db961f611..8b1f4c92755b9 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveComparisonTest.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveComparisonTest.scala @@ -346,8 +346,7 @@ abstract class HiveComparisonTest val catalystResults = queryList.zip(hiveResults).map { case (queryString, hive) => val query = new TestHiveQueryExecution(queryString.replace("../../data", testDataPath)) def getResult(): Seq[String] = { - SQLExecution.withNewExecutionId( - query.sparkSession, query)(hiveResultString(query.executedPlan)) + SQLExecution.withNewExecutionId(query)(hiveResultString(query.executedPlan)) } try { (query, prepareAnswer(query, getResult())) } catch { case e: Throwable => diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/test/TestHive.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/test/TestHive.scala index cc4592a5caf68..222244a04f5f5 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/test/TestHive.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/test/TestHive.scala @@ -501,7 +501,7 @@ private[hive] class TestHiveSparkSession( // has already set the execution id. if (sparkContext.getLocalProperty(SQLExecution.EXECUTION_ID_KEY) == null) { // We don't actually have a `QueryExecution` here, use a fake one instead. - SQLExecution.withNewExecutionId(this, new QueryExecution(this, OneRowRelation())) { + SQLExecution.withNewExecutionId(new QueryExecution(this, OneRowRelation())) { createCmds.foreach(_()) } } else { From 57254c9719f9af9ad985596ed7fbbaafa4052002 Mon Sep 17 00:00:00 2001 From: sarthfrey-db Date: Thu, 13 Feb 2020 16:15:00 -0800 Subject: [PATCH 041/185] [SPARK-30667][CORE] Add allGather method to BarrierTaskContext ### What changes were proposed in this pull request? The `allGather` method is added to the `BarrierTaskContext`. This method contains the same functionality as the `BarrierTaskContext.barrier` method; it blocks the task until all tasks make the call, at which time they may continue execution. In addition, the `allGather` method takes an input message. Upon returning from the `allGather` the task receives a list of all the messages sent by all the tasks that made the `allGather` call. ### Why are the changes needed? There are many situations where having the tasks communicate in a synchronized way is useful. 
One simple example is if each task needs to start a server to serve requests from one another; first the tasks must find a free port (the result of which is undetermined beforehand) and then start making requests, but to do so they each must know the port chosen by the other task. An `allGather` method would allow them to inform each other of the port they will run on. ### Does this PR introduce any user-facing change? Yes, an `BarrierTaskContext.allGather` method will be available through the Scala, Java, and Python APIs. ### How was this patch tested? Most of the code path is already covered by tests to the `barrier` method, since this PR includes a refactor so that much code is shared by the `barrier` and `allGather` methods. However, a test is added to assert that an all gather on each tasks partition ID will return a list of every partition ID. An example through the Python API: ```python >>> from pyspark import BarrierTaskContext >>> >>> def f(iterator): ... context = BarrierTaskContext.get() ... return [context.allGather('{}'.format(context.partitionId()))] ... >>> sc.parallelize(range(4), 4).barrier().mapPartitions(f).collect()[0] [u'3', u'1', u'0', u'2'] ``` Closes #27395 from sarthfrey/master. Lead-authored-by: sarthfrey-db Co-authored-by: sarthfrey Signed-off-by: Xiangrui Meng --- .../org/apache/spark/BarrierCoordinator.scala | 113 +++++++++++-- .../org/apache/spark/BarrierTaskContext.scala | 153 ++++++++++++------ .../spark/api/python/PythonRunner.scala | 51 ++++-- .../scheduler/BarrierTaskContextSuite.scala | 74 +++++++++ python/pyspark/taskcontext.py | 49 +++++- python/pyspark/tests/test_taskcontext.py | 20 +++ 6 files changed, 381 insertions(+), 79 deletions(-) diff --git a/core/src/main/scala/org/apache/spark/BarrierCoordinator.scala b/core/src/main/scala/org/apache/spark/BarrierCoordinator.scala index 4e417679ca663..042a2664a0e27 100644 --- a/core/src/main/scala/org/apache/spark/BarrierCoordinator.scala +++ b/core/src/main/scala/org/apache/spark/BarrierCoordinator.scala @@ -17,12 +17,17 @@ package org.apache.spark +import java.nio.charset.StandardCharsets.UTF_8 import java.util.{Timer, TimerTask} import java.util.concurrent.ConcurrentHashMap import java.util.function.Consumer import scala.collection.mutable.ArrayBuffer +import org.json4s.JsonAST._ +import org.json4s.JsonDSL._ +import org.json4s.jackson.JsonMethods.{compact, render} + import org.apache.spark.internal.Logging import org.apache.spark.rpc.{RpcCallContext, RpcEnv, ThreadSafeRpcEndpoint} import org.apache.spark.scheduler.{LiveListenerBus, SparkListener, SparkListenerStageCompleted} @@ -99,10 +104,15 @@ private[spark] class BarrierCoordinator( // reset when a barrier() call fails due to timeout. private var barrierEpoch: Int = 0 - // An array of RPCCallContexts for barrier tasks that are waiting for reply of a barrier() - // call. + // An Array of RPCCallContexts for barrier tasks that have made a blocking runBarrier() call private val requesters: ArrayBuffer[RpcCallContext] = new ArrayBuffer[RpcCallContext](numTasks) + // An Array of allGather messages for barrier tasks that have made a blocking runBarrier() call + private val allGatherMessages: ArrayBuffer[String] = new Array[String](numTasks).to[ArrayBuffer] + + // The blocking requestMethod called by tasks to sync up for this stage attempt + private var requestMethodToSync: RequestMethod.Value = RequestMethod.BARRIER + // A timer task that ensures we may timeout for a barrier() call. 
private var timerTask: TimerTask = null @@ -130,9 +140,32 @@ private[spark] class BarrierCoordinator( // Process the global sync request. The barrier() call succeed if collected enough requests // within a configured time, otherwise fail all the pending requests. - def handleRequest(requester: RpcCallContext, request: RequestToSync): Unit = synchronized { + def handleRequest( + requester: RpcCallContext, + request: RequestToSync + ): Unit = synchronized { val taskId = request.taskAttemptId val epoch = request.barrierEpoch + val requestMethod = request.requestMethod + val partitionId = request.partitionId + val allGatherMessage = request match { + case ag: AllGatherRequestToSync => ag.allGatherMessage + case _ => "" + } + + if (requesters.size == 0) { + requestMethodToSync = requestMethod + } + + if (requestMethodToSync != requestMethod) { + requesters.foreach( + _.sendFailure(new SparkException(s"$barrierId tried to use requestMethod " + + s"`$requestMethod` during barrier epoch $barrierEpoch, which does not match " + + s"the current synchronized requestMethod `$requestMethodToSync`" + )) + ) + cleanupBarrierStage(barrierId) + } // Require the number of tasks is correctly set from the BarrierTaskContext. require(request.numTasks == numTasks, s"Number of tasks of $barrierId is " + @@ -153,6 +186,7 @@ private[spark] class BarrierCoordinator( } // Add the requester to array of RPCCallContexts pending for reply. requesters += requester + allGatherMessages(partitionId) = allGatherMessage logInfo(s"Barrier sync epoch $barrierEpoch from $barrierId received update from Task " + s"$taskId, current progress: ${requesters.size}/$numTasks.") if (maybeFinishAllRequesters(requesters, numTasks)) { @@ -162,6 +196,7 @@ private[spark] class BarrierCoordinator( s"tasks, finished successfully.") barrierEpoch += 1 requesters.clear() + allGatherMessages.clear() cancelTimerTask() } } @@ -173,7 +208,13 @@ private[spark] class BarrierCoordinator( requesters: ArrayBuffer[RpcCallContext], numTasks: Int): Boolean = { if (requesters.size == numTasks) { - requesters.foreach(_.reply(())) + requestMethodToSync match { + case RequestMethod.BARRIER => + requesters.foreach(_.reply("")) + case RequestMethod.ALL_GATHER => + val json: String = compact(render(allGatherMessages)) + requesters.foreach(_.reply(json)) + } true } else { false @@ -186,6 +227,7 @@ private[spark] class BarrierCoordinator( // messages come from current stage attempt shall fail. barrierEpoch = -1 requesters.clear() + allGatherMessages.clear() cancelTimerTask() } } @@ -199,11 +241,11 @@ private[spark] class BarrierCoordinator( } override def receiveAndReply(context: RpcCallContext): PartialFunction[Any, Unit] = { - case request @ RequestToSync(numTasks, stageId, stageAttemptId, _, _) => + case request: RequestToSync => // Get or init the ContextBarrierState correspond to the stage attempt. 
- val barrierId = ContextBarrierId(stageId, stageAttemptId) + val barrierId = ContextBarrierId(request.stageId, request.stageAttemptId) states.computeIfAbsent(barrierId, - (key: ContextBarrierId) => new ContextBarrierState(key, numTasks)) + (key: ContextBarrierId) => new ContextBarrierState(key, request.numTasks)) val barrierState = states.get(barrierId) barrierState.handleRequest(context, request) @@ -216,6 +258,16 @@ private[spark] class BarrierCoordinator( private[spark] sealed trait BarrierCoordinatorMessage extends Serializable +private[spark] sealed trait RequestToSync extends BarrierCoordinatorMessage { + def numTasks: Int + def stageId: Int + def stageAttemptId: Int + def taskAttemptId: Long + def barrierEpoch: Int + def partitionId: Int + def requestMethod: RequestMethod.Value +} + /** * A global sync request message from BarrierTaskContext, by `barrier()` call. Each request is * identified by stageId + stageAttemptId + barrierEpoch. @@ -224,11 +276,44 @@ private[spark] sealed trait BarrierCoordinatorMessage extends Serializable * @param stageId ID of current stage * @param stageAttemptId ID of current stage attempt * @param taskAttemptId Unique ID of current task - * @param barrierEpoch ID of the `barrier()` call, a task may consist multiple `barrier()` calls. + * @param barrierEpoch ID of the `barrier()` call, a task may consist multiple `barrier()` calls + * @param partitionId ID of the current partition the task is assigned to + * @param requestMethod The BarrierTaskContext method that was called to trigger BarrierCoordinator */ -private[spark] case class RequestToSync( - numTasks: Int, - stageId: Int, - stageAttemptId: Int, - taskAttemptId: Long, - barrierEpoch: Int) extends BarrierCoordinatorMessage +private[spark] case class BarrierRequestToSync( + numTasks: Int, + stageId: Int, + stageAttemptId: Int, + taskAttemptId: Long, + barrierEpoch: Int, + partitionId: Int, + requestMethod: RequestMethod.Value +) extends RequestToSync + +/** + * A global sync request message from BarrierTaskContext, by `allGather()` call. Each request is + * identified by stageId + stageAttemptId + barrierEpoch. 
+ * + * @param numTasks The number of global sync requests the BarrierCoordinator shall receive + * @param stageId ID of current stage + * @param stageAttemptId ID of current stage attempt + * @param taskAttemptId Unique ID of current task + * @param barrierEpoch ID of the `barrier()` call, a task may consist multiple `barrier()` calls + * @param partitionId ID of the current partition the task is assigned to + * @param requestMethod The BarrierTaskContext method that was called to trigger BarrierCoordinator + * @param allGatherMessage Message sent from the BarrierTaskContext if requestMethod is ALL_GATHER + */ +private[spark] case class AllGatherRequestToSync( + numTasks: Int, + stageId: Int, + stageAttemptId: Int, + taskAttemptId: Long, + barrierEpoch: Int, + partitionId: Int, + requestMethod: RequestMethod.Value, + allGatherMessage: String +) extends RequestToSync + +private[spark] object RequestMethod extends Enumeration { + val BARRIER, ALL_GATHER = Value +} diff --git a/core/src/main/scala/org/apache/spark/BarrierTaskContext.scala b/core/src/main/scala/org/apache/spark/BarrierTaskContext.scala index 3d369802f3023..2263538a11676 100644 --- a/core/src/main/scala/org/apache/spark/BarrierTaskContext.scala +++ b/core/src/main/scala/org/apache/spark/BarrierTaskContext.scala @@ -17,11 +17,19 @@ package org.apache.spark +import java.nio.charset.StandardCharsets.UTF_8 import java.util.{Properties, Timer, TimerTask} import scala.collection.JavaConverters._ +import scala.collection.mutable.ArrayBuffer import scala.concurrent.TimeoutException import scala.concurrent.duration._ +import scala.language.postfixOps + +import org.json4s.DefaultFormats +import org.json4s.JsonAST._ +import org.json4s.JsonDSL._ +import org.json4s.jackson.JsonMethods.parse import org.apache.spark.annotation.{Experimental, Since} import org.apache.spark.executor.TaskMetrics @@ -59,49 +67,31 @@ class BarrierTaskContext private[spark] ( // from different tasks within the same barrier stage attempt to succeed. private lazy val numTasks = getTaskInfos().size - /** - * :: Experimental :: - * Sets a global barrier and waits until all tasks in this stage hit this barrier. Similar to - * MPI_Barrier function in MPI, the barrier() function call blocks until all tasks in the same - * stage have reached this routine. - * - * CAUTION! In a barrier stage, each task must have the same number of barrier() calls, in all - * possible code branches. Otherwise, you may get the job hanging or a SparkException after - * timeout. Some examples of '''misuses''' are listed below: - * 1. Only call barrier() function on a subset of all the tasks in the same barrier stage, it - * shall lead to timeout of the function call. - * {{{ - * rdd.barrier().mapPartitions { iter => - * val context = BarrierTaskContext.get() - * if (context.partitionId() == 0) { - * // Do nothing. - * } else { - * context.barrier() - * } - * iter - * } - * }}} - * - * 2. Include barrier() function in a try-catch code block, this may lead to timeout of the - * second function call. - * {{{ - * rdd.barrier().mapPartitions { iter => - * val context = BarrierTaskContext.get() - * try { - * // Do something that might throw an Exception. 
- * doSomething() - * context.barrier() - * } catch { - * case e: Exception => logWarning("...", e) - * } - * context.barrier() - * iter - * } - * }}} - */ - @Experimental - @Since("2.4.0") - def barrier(): Unit = { + private def getRequestToSync( + numTasks: Int, + stageId: Int, + stageAttemptNumber: Int, + taskAttemptId: Long, + barrierEpoch: Int, + partitionId: Int, + requestMethod: RequestMethod.Value, + allGatherMessage: String + ): RequestToSync = { + requestMethod match { + case RequestMethod.BARRIER => + BarrierRequestToSync(numTasks, stageId, stageAttemptNumber, taskAttemptId, + barrierEpoch, partitionId, requestMethod) + case RequestMethod.ALL_GATHER => + AllGatherRequestToSync(numTasks, stageId, stageAttemptNumber, taskAttemptId, + barrierEpoch, partitionId, requestMethod, allGatherMessage) + } + } + + private def runBarrier( + requestMethod: RequestMethod.Value, + allGatherMessage: String = "" + ): String = { + logInfo(s"Task $taskAttemptId from Stage $stageId(Attempt $stageAttemptNumber) has entered " + s"the global sync, current barrier epoch is $barrierEpoch.") logTrace("Current callSite: " + Utils.getCallSite()) @@ -118,10 +108,12 @@ class BarrierTaskContext private[spark] ( // Log the update of global sync every 60 seconds. timer.schedule(timerTask, 60000, 60000) + var json: String = "" + try { - val abortableRpcFuture = barrierCoordinator.askAbortable[Unit]( - message = RequestToSync(numTasks, stageId, stageAttemptNumber, taskAttemptId, - barrierEpoch), + val abortableRpcFuture = barrierCoordinator.askAbortable[String]( + message = getRequestToSync(numTasks, stageId, stageAttemptNumber, + taskAttemptId, barrierEpoch, partitionId, requestMethod, allGatherMessage), // Set a fixed timeout for RPC here, so users shall get a SparkException thrown by // BarrierCoordinator on timeout, instead of RPCTimeoutException from the RPC framework. timeout = new RpcTimeout(365.days, "barrierTimeout")) @@ -133,7 +125,7 @@ class BarrierTaskContext private[spark] ( while (!abortableRpcFuture.toFuture.isCompleted) { // wait RPC future for at most 1 second try { - ThreadUtils.awaitResult(abortableRpcFuture.toFuture, 1.second) + json = ThreadUtils.awaitResult(abortableRpcFuture.toFuture, 1.second) } catch { case _: TimeoutException | _: InterruptedException => // If `TimeoutException` thrown, waiting RPC future reach 1 second. @@ -163,6 +155,73 @@ class BarrierTaskContext private[spark] ( timerTask.cancel() timer.purge() } + json + } + + /** + * :: Experimental :: + * Sets a global barrier and waits until all tasks in this stage hit this barrier. Similar to + * MPI_Barrier function in MPI, the barrier() function call blocks until all tasks in the same + * stage have reached this routine. + * + * CAUTION! In a barrier stage, each task must have the same number of barrier() calls, in all + * possible code branches. Otherwise, you may get the job hanging or a SparkException after + * timeout. Some examples of '''misuses''' are listed below: + * 1. Only call barrier() function on a subset of all the tasks in the same barrier stage, it + * shall lead to timeout of the function call. + * {{{ + * rdd.barrier().mapPartitions { iter => + * val context = BarrierTaskContext.get() + * if (context.partitionId() == 0) { + * // Do nothing. + * } else { + * context.barrier() + * } + * iter + * } + * }}} + * + * 2. Include barrier() function in a try-catch code block, this may lead to timeout of the + * second function call. 
+ * {{{ + * rdd.barrier().mapPartitions { iter => + * val context = BarrierTaskContext.get() + * try { + * // Do something that might throw an Exception. + * doSomething() + * context.barrier() + * } catch { + * case e: Exception => logWarning("...", e) + * } + * context.barrier() + * iter + * } + * }}} + */ + @Experimental + @Since("2.4.0") + def barrier(): Unit = { + runBarrier(RequestMethod.BARRIER) + () + } + + /** + * :: Experimental :: + * Blocks until all tasks in the same stage have reached this routine. Each task passes in + * a message and returns with a list of all the messages passed in by each of those tasks. + * + * CAUTION! The allGather method requires the same precautions as the barrier method + * + * The message is type String rather than Array[Byte] because it is more convenient for + * the user at the cost of worse performance. + */ + @Experimental + @Since("3.0.0") + def allGather(message: String): ArrayBuffer[String] = { + val json = runBarrier(RequestMethod.ALL_GATHER, message) + val jsonArray = parse(json) + implicit val formats = DefaultFormats + ArrayBuffer(jsonArray.extract[Array[String]]: _*) } /** diff --git a/core/src/main/scala/org/apache/spark/api/python/PythonRunner.scala b/core/src/main/scala/org/apache/spark/api/python/PythonRunner.scala index 658e0d593a167..fa8bf0fc06358 100644 --- a/core/src/main/scala/org/apache/spark/api/python/PythonRunner.scala +++ b/core/src/main/scala/org/apache/spark/api/python/PythonRunner.scala @@ -24,8 +24,13 @@ import java.nio.charset.StandardCharsets.UTF_8 import java.util.concurrent.atomic.AtomicBoolean import scala.collection.JavaConverters._ +import scala.collection.mutable.ArrayBuffer import scala.util.control.NonFatal +import org.json4s.JsonAST._ +import org.json4s.JsonDSL._ +import org.json4s.jackson.JsonMethods.{compact, render} + import org.apache.spark._ import org.apache.spark.internal.Logging import org.apache.spark.internal.config.{BUFFER_SIZE, EXECUTOR_CORES} @@ -238,13 +243,18 @@ private[spark] abstract class BasePythonRunner[IN, OUT]( sock.setSoTimeout(10000) authHelper.authClient(sock) val input = new DataInputStream(sock.getInputStream()) - input.readInt() match { + val requestMethod = input.readInt() + // The BarrierTaskContext function may wait infinitely, socket shall not timeout + // before the function finishes. + sock.setSoTimeout(0) + requestMethod match { case BarrierTaskContextMessageProtocol.BARRIER_FUNCTION => - // The barrier() function may wait infinitely, socket shall not timeout - // before the function finishes. - sock.setSoTimeout(0) - barrierAndServe(sock) - + barrierAndServe(requestMethod, sock) + case BarrierTaskContextMessageProtocol.ALL_GATHER_FUNCTION => + val length = input.readInt() + val message = new Array[Byte](length) + input.readFully(message) + barrierAndServe(requestMethod, sock, new String(message, UTF_8)) case _ => val out = new DataOutputStream(new BufferedOutputStream( sock.getOutputStream)) @@ -395,15 +405,31 @@ private[spark] abstract class BasePythonRunner[IN, OUT]( } /** - * Gateway to call BarrierTaskContext.barrier(). + * Gateway to call BarrierTaskContext methods. */ - def barrierAndServe(sock: Socket): Unit = { - require(serverSocket.isDefined, "No available ServerSocket to redirect the barrier() call.") - + def barrierAndServe(requestMethod: Int, sock: Socket, message: String = ""): Unit = { + require( + serverSocket.isDefined, + "No available ServerSocket to redirect the BarrierTaskContext method call." 
+ ) val out = new DataOutputStream(new BufferedOutputStream(sock.getOutputStream)) try { - context.asInstanceOf[BarrierTaskContext].barrier() - writeUTF(BarrierTaskContextMessageProtocol.BARRIER_RESULT_SUCCESS, out) + var result: String = "" + requestMethod match { + case BarrierTaskContextMessageProtocol.BARRIER_FUNCTION => + context.asInstanceOf[BarrierTaskContext].barrier() + result = BarrierTaskContextMessageProtocol.BARRIER_RESULT_SUCCESS + case BarrierTaskContextMessageProtocol.ALL_GATHER_FUNCTION => + val messages: ArrayBuffer[String] = context.asInstanceOf[BarrierTaskContext].allGather( + message + ) + result = compact(render(JArray( + messages.map( + (message) => JString(message) + ).toList + ))) + } + writeUTF(result, out) } catch { case e: SparkException => writeUTF(e.getMessage, out) @@ -638,6 +664,7 @@ private[spark] object SpecialLengths { private[spark] object BarrierTaskContextMessageProtocol { val BARRIER_FUNCTION = 1 + val ALL_GATHER_FUNCTION = 2 val BARRIER_RESULT_SUCCESS = "success" val ERROR_UNRECOGNIZED_FUNCTION = "Not recognized function call from python side." } diff --git a/core/src/test/scala/org/apache/spark/scheduler/BarrierTaskContextSuite.scala b/core/src/test/scala/org/apache/spark/scheduler/BarrierTaskContextSuite.scala index fc8ac38479932..ed38b7f0ecac1 100644 --- a/core/src/test/scala/org/apache/spark/scheduler/BarrierTaskContextSuite.scala +++ b/core/src/test/scala/org/apache/spark/scheduler/BarrierTaskContextSuite.scala @@ -19,6 +19,7 @@ package org.apache.spark.scheduler import java.io.File +import scala.collection.mutable.ArrayBuffer import scala.util.Random import org.apache.spark._ @@ -52,6 +53,79 @@ class BarrierTaskContextSuite extends SparkFunSuite with LocalSparkContext { assert(times.max - times.min <= 1000) } + test("share messages with allGather() call") { + val conf = new SparkConf() + .setMaster("local-cluster[4, 1, 1024]") + .setAppName("test-cluster") + sc = new SparkContext(conf) + val rdd = sc.makeRDD(1 to 10, 4) + val rdd2 = rdd.barrier().mapPartitions { it => + val context = BarrierTaskContext.get() + // Sleep for a random time before global sync. 
+ Thread.sleep(Random.nextInt(1000)) + // Pass partitionId message in + val message = context.partitionId().toString + val messages = context.allGather(message) + messages.toList.iterator + } + // Take a sorted list of all the partitionId messages + val messages = rdd2.collect().head + // All the task partitionIds are shared + for((x, i) <- messages.view.zipWithIndex) assert(x == i.toString) + } + + test("throw exception if we attempt to synchronize with different blocking calls") { + val conf = new SparkConf() + .setMaster("local-cluster[4, 1, 1024]") + .setAppName("test-cluster") + sc = new SparkContext(conf) + val rdd = sc.makeRDD(1 to 10, 4) + val rdd2 = rdd.barrier().mapPartitions { it => + val context = BarrierTaskContext.get() + val partitionId = context.partitionId + if (partitionId == 0) { + context.barrier() + } else { + context.allGather(partitionId.toString) + } + Seq(null).iterator + } + val error = intercept[SparkException] { + rdd2.collect() + }.getMessage + assert(error.contains("does not match the current synchronized requestMethod")) + } + + test("successively sync with allGather and barrier") { + val conf = new SparkConf() + .setMaster("local-cluster[4, 1, 1024]") + .setAppName("test-cluster") + sc = new SparkContext(conf) + val rdd = sc.makeRDD(1 to 10, 4) + val rdd2 = rdd.barrier().mapPartitions { it => + val context = BarrierTaskContext.get() + // Sleep for a random time before global sync. + Thread.sleep(Random.nextInt(1000)) + context.barrier() + val time1 = System.currentTimeMillis() + // Sleep for a random time before global sync. + Thread.sleep(Random.nextInt(1000)) + // Pass partitionId message in + val message = context.partitionId().toString + val messages = context.allGather(message) + val time2 = System.currentTimeMillis() + Seq((time1, time2)).iterator + } + val times = rdd2.collect() + // All the tasks shall finish the first round of global sync within a short time slot. + val times1 = times.map(_._1) + assert(times1.max - times1.min <= 1000) + + // All the tasks shall finish the second round of global sync within a short time slot. + val times2 = times.map(_._2) + assert(times2.max - times2.min <= 1000) + } + test("support multiple barrier() call within a single task") { initLocalClusterSparkContext() val rdd = sc.makeRDD(1 to 10, 4) diff --git a/python/pyspark/taskcontext.py b/python/pyspark/taskcontext.py index d648f63338514..90bd2345ac525 100644 --- a/python/pyspark/taskcontext.py +++ b/python/pyspark/taskcontext.py @@ -16,9 +16,10 @@ # from __future__ import print_function +import json from pyspark.java_gateway import local_connect_and_auth -from pyspark.serializers import write_int, UTF8Deserializer +from pyspark.serializers import write_int, write_with_length, UTF8Deserializer class TaskContext(object): @@ -107,18 +108,28 @@ def resources(self): BARRIER_FUNCTION = 1 +ALL_GATHER_FUNCTION = 2 -def _load_from_socket(port, auth_secret): +def _load_from_socket(port, auth_secret, function, all_gather_message=None): """ Load data from a given socket, this is a blocking method thus only return when the socket connection has been closed. """ (sockfile, sock) = local_connect_and_auth(port, auth_secret) - # The barrier() call may block forever, so no timeout + + # The call may block forever, so no timeout sock.settimeout(None) - # Make a barrier() function call. - write_int(BARRIER_FUNCTION, sockfile) + + if function == BARRIER_FUNCTION: + # Make a barrier() function call. 
+ write_int(function, sockfile) + elif function == ALL_GATHER_FUNCTION: + # Make a all_gather() function call. + write_int(function, sockfile) + write_with_length(all_gather_message.encode("utf-8"), sockfile) + else: + raise ValueError("Unrecognized function type") sockfile.flush() # Collect result. @@ -199,7 +210,33 @@ def barrier(self): raise Exception("Not supported to call barrier() before initialize " + "BarrierTaskContext.") else: - _load_from_socket(self._port, self._secret) + _load_from_socket(self._port, self._secret, BARRIER_FUNCTION) + + def allGather(self, message=""): + """ + .. note:: Experimental + + This function blocks until all tasks in the same stage have reached this routine. + Each task passes in a message and returns with a list of all the messages passed in + by each of those tasks. + + .. warning:: In a barrier stage, each task much have the same number of `allGather()` + calls, in all possible code branches. + Otherwise, you may get the job hanging or a SparkException after timeout. + """ + if not isinstance(message, str): + raise ValueError("Argument `message` must be of type `str`") + elif self._port is None or self._secret is None: + raise Exception("Not supported to call barrier() before initialize " + + "BarrierTaskContext.") + else: + gathered_items = _load_from_socket( + self._port, + self._secret, + ALL_GATHER_FUNCTION, + message, + ) + return [e for e in json.loads(gathered_items)] def getTaskInfos(self): """ diff --git a/python/pyspark/tests/test_taskcontext.py b/python/pyspark/tests/test_taskcontext.py index 68cfe814762e0..0053aadd9c3ed 100644 --- a/python/pyspark/tests/test_taskcontext.py +++ b/python/pyspark/tests/test_taskcontext.py @@ -135,6 +135,26 @@ def context_barrier(x): times = rdd.barrier().mapPartitions(f).map(context_barrier).collect() self.assertTrue(max(times) - min(times) < 1) + def test_all_gather(self): + """ + Verify that BarrierTaskContext.allGather() performs global sync among all barrier tasks + within a stage and passes messages properly. + """ + rdd = self.sc.parallelize(range(10), 4) + + def f(iterator): + yield sum(iterator) + + def context_barrier(x): + tc = BarrierTaskContext.get() + time.sleep(random.randint(1, 10)) + out = tc.allGather(str(context.partitionId())) + pids = [int(e) for e in out] + return [pids] + + pids = rdd.barrier().mapPartitions(f).map(context_barrier).collect()[0] + self.assertTrue(pids == [0, 1, 2, 3]) + def test_barrier_infos(self): """ Verify that BarrierTaskContext.getTaskInfos() returns a list of all task infos in the From fa3517cdb163b0589dc02c7d1fefb65be811f65f Mon Sep 17 00:00:00 2001 From: Xingbo Jiang Date: Thu, 13 Feb 2020 17:43:55 -0800 Subject: [PATCH 042/185] Revert "[SPARK-30667][CORE] Add allGather method to BarrierTaskContext" This reverts commit 57254c9719f9af9ad985596ed7fbbaafa4052002. 
--- .../org/apache/spark/BarrierCoordinator.scala | 113 ++----------- .../org/apache/spark/BarrierTaskContext.scala | 153 ++++++------------ .../spark/api/python/PythonRunner.scala | 51 ++---- .../scheduler/BarrierTaskContextSuite.scala | 74 --------- python/pyspark/taskcontext.py | 49 +----- python/pyspark/tests/test_taskcontext.py | 20 --- 6 files changed, 79 insertions(+), 381 deletions(-) diff --git a/core/src/main/scala/org/apache/spark/BarrierCoordinator.scala b/core/src/main/scala/org/apache/spark/BarrierCoordinator.scala index 042a2664a0e27..4e417679ca663 100644 --- a/core/src/main/scala/org/apache/spark/BarrierCoordinator.scala +++ b/core/src/main/scala/org/apache/spark/BarrierCoordinator.scala @@ -17,17 +17,12 @@ package org.apache.spark -import java.nio.charset.StandardCharsets.UTF_8 import java.util.{Timer, TimerTask} import java.util.concurrent.ConcurrentHashMap import java.util.function.Consumer import scala.collection.mutable.ArrayBuffer -import org.json4s.JsonAST._ -import org.json4s.JsonDSL._ -import org.json4s.jackson.JsonMethods.{compact, render} - import org.apache.spark.internal.Logging import org.apache.spark.rpc.{RpcCallContext, RpcEnv, ThreadSafeRpcEndpoint} import org.apache.spark.scheduler.{LiveListenerBus, SparkListener, SparkListenerStageCompleted} @@ -104,15 +99,10 @@ private[spark] class BarrierCoordinator( // reset when a barrier() call fails due to timeout. private var barrierEpoch: Int = 0 - // An Array of RPCCallContexts for barrier tasks that have made a blocking runBarrier() call + // An array of RPCCallContexts for barrier tasks that are waiting for reply of a barrier() + // call. private val requesters: ArrayBuffer[RpcCallContext] = new ArrayBuffer[RpcCallContext](numTasks) - // An Array of allGather messages for barrier tasks that have made a blocking runBarrier() call - private val allGatherMessages: ArrayBuffer[String] = new Array[String](numTasks).to[ArrayBuffer] - - // The blocking requestMethod called by tasks to sync up for this stage attempt - private var requestMethodToSync: RequestMethod.Value = RequestMethod.BARRIER - // A timer task that ensures we may timeout for a barrier() call. private var timerTask: TimerTask = null @@ -140,32 +130,9 @@ private[spark] class BarrierCoordinator( // Process the global sync request. The barrier() call succeed if collected enough requests // within a configured time, otherwise fail all the pending requests. - def handleRequest( - requester: RpcCallContext, - request: RequestToSync - ): Unit = synchronized { + def handleRequest(requester: RpcCallContext, request: RequestToSync): Unit = synchronized { val taskId = request.taskAttemptId val epoch = request.barrierEpoch - val requestMethod = request.requestMethod - val partitionId = request.partitionId - val allGatherMessage = request match { - case ag: AllGatherRequestToSync => ag.allGatherMessage - case _ => "" - } - - if (requesters.size == 0) { - requestMethodToSync = requestMethod - } - - if (requestMethodToSync != requestMethod) { - requesters.foreach( - _.sendFailure(new SparkException(s"$barrierId tried to use requestMethod " + - s"`$requestMethod` during barrier epoch $barrierEpoch, which does not match " + - s"the current synchronized requestMethod `$requestMethodToSync`" - )) - ) - cleanupBarrierStage(barrierId) - } // Require the number of tasks is correctly set from the BarrierTaskContext. 
require(request.numTasks == numTasks, s"Number of tasks of $barrierId is " + @@ -186,7 +153,6 @@ private[spark] class BarrierCoordinator( } // Add the requester to array of RPCCallContexts pending for reply. requesters += requester - allGatherMessages(partitionId) = allGatherMessage logInfo(s"Barrier sync epoch $barrierEpoch from $barrierId received update from Task " + s"$taskId, current progress: ${requesters.size}/$numTasks.") if (maybeFinishAllRequesters(requesters, numTasks)) { @@ -196,7 +162,6 @@ private[spark] class BarrierCoordinator( s"tasks, finished successfully.") barrierEpoch += 1 requesters.clear() - allGatherMessages.clear() cancelTimerTask() } } @@ -208,13 +173,7 @@ private[spark] class BarrierCoordinator( requesters: ArrayBuffer[RpcCallContext], numTasks: Int): Boolean = { if (requesters.size == numTasks) { - requestMethodToSync match { - case RequestMethod.BARRIER => - requesters.foreach(_.reply("")) - case RequestMethod.ALL_GATHER => - val json: String = compact(render(allGatherMessages)) - requesters.foreach(_.reply(json)) - } + requesters.foreach(_.reply(())) true } else { false @@ -227,7 +186,6 @@ private[spark] class BarrierCoordinator( // messages come from current stage attempt shall fail. barrierEpoch = -1 requesters.clear() - allGatherMessages.clear() cancelTimerTask() } } @@ -241,11 +199,11 @@ private[spark] class BarrierCoordinator( } override def receiveAndReply(context: RpcCallContext): PartialFunction[Any, Unit] = { - case request: RequestToSync => + case request @ RequestToSync(numTasks, stageId, stageAttemptId, _, _) => // Get or init the ContextBarrierState correspond to the stage attempt. - val barrierId = ContextBarrierId(request.stageId, request.stageAttemptId) + val barrierId = ContextBarrierId(stageId, stageAttemptId) states.computeIfAbsent(barrierId, - (key: ContextBarrierId) => new ContextBarrierState(key, request.numTasks)) + (key: ContextBarrierId) => new ContextBarrierState(key, numTasks)) val barrierState = states.get(barrierId) barrierState.handleRequest(context, request) @@ -258,16 +216,6 @@ private[spark] class BarrierCoordinator( private[spark] sealed trait BarrierCoordinatorMessage extends Serializable -private[spark] sealed trait RequestToSync extends BarrierCoordinatorMessage { - def numTasks: Int - def stageId: Int - def stageAttemptId: Int - def taskAttemptId: Long - def barrierEpoch: Int - def partitionId: Int - def requestMethod: RequestMethod.Value -} - /** * A global sync request message from BarrierTaskContext, by `barrier()` call. Each request is * identified by stageId + stageAttemptId + barrierEpoch. @@ -276,44 +224,11 @@ private[spark] sealed trait RequestToSync extends BarrierCoordinatorMessage { * @param stageId ID of current stage * @param stageAttemptId ID of current stage attempt * @param taskAttemptId Unique ID of current task - * @param barrierEpoch ID of the `barrier()` call, a task may consist multiple `barrier()` calls - * @param partitionId ID of the current partition the task is assigned to - * @param requestMethod The BarrierTaskContext method that was called to trigger BarrierCoordinator + * @param barrierEpoch ID of the `barrier()` call, a task may consist multiple `barrier()` calls. */ -private[spark] case class BarrierRequestToSync( - numTasks: Int, - stageId: Int, - stageAttemptId: Int, - taskAttemptId: Long, - barrierEpoch: Int, - partitionId: Int, - requestMethod: RequestMethod.Value -) extends RequestToSync - -/** - * A global sync request message from BarrierTaskContext, by `allGather()` call. 
Each request is - * identified by stageId + stageAttemptId + barrierEpoch. - * - * @param numTasks The number of global sync requests the BarrierCoordinator shall receive - * @param stageId ID of current stage - * @param stageAttemptId ID of current stage attempt - * @param taskAttemptId Unique ID of current task - * @param barrierEpoch ID of the `barrier()` call, a task may consist multiple `barrier()` calls - * @param partitionId ID of the current partition the task is assigned to - * @param requestMethod The BarrierTaskContext method that was called to trigger BarrierCoordinator - * @param allGatherMessage Message sent from the BarrierTaskContext if requestMethod is ALL_GATHER - */ -private[spark] case class AllGatherRequestToSync( - numTasks: Int, - stageId: Int, - stageAttemptId: Int, - taskAttemptId: Long, - barrierEpoch: Int, - partitionId: Int, - requestMethod: RequestMethod.Value, - allGatherMessage: String -) extends RequestToSync - -private[spark] object RequestMethod extends Enumeration { - val BARRIER, ALL_GATHER = Value -} +private[spark] case class RequestToSync( + numTasks: Int, + stageId: Int, + stageAttemptId: Int, + taskAttemptId: Long, + barrierEpoch: Int) extends BarrierCoordinatorMessage diff --git a/core/src/main/scala/org/apache/spark/BarrierTaskContext.scala b/core/src/main/scala/org/apache/spark/BarrierTaskContext.scala index 2263538a11676..3d369802f3023 100644 --- a/core/src/main/scala/org/apache/spark/BarrierTaskContext.scala +++ b/core/src/main/scala/org/apache/spark/BarrierTaskContext.scala @@ -17,19 +17,11 @@ package org.apache.spark -import java.nio.charset.StandardCharsets.UTF_8 import java.util.{Properties, Timer, TimerTask} import scala.collection.JavaConverters._ -import scala.collection.mutable.ArrayBuffer import scala.concurrent.TimeoutException import scala.concurrent.duration._ -import scala.language.postfixOps - -import org.json4s.DefaultFormats -import org.json4s.JsonAST._ -import org.json4s.JsonDSL._ -import org.json4s.jackson.JsonMethods.parse import org.apache.spark.annotation.{Experimental, Since} import org.apache.spark.executor.TaskMetrics @@ -67,31 +59,49 @@ class BarrierTaskContext private[spark] ( // from different tasks within the same barrier stage attempt to succeed. private lazy val numTasks = getTaskInfos().size - private def getRequestToSync( - numTasks: Int, - stageId: Int, - stageAttemptNumber: Int, - taskAttemptId: Long, - barrierEpoch: Int, - partitionId: Int, - requestMethod: RequestMethod.Value, - allGatherMessage: String - ): RequestToSync = { - requestMethod match { - case RequestMethod.BARRIER => - BarrierRequestToSync(numTasks, stageId, stageAttemptNumber, taskAttemptId, - barrierEpoch, partitionId, requestMethod) - case RequestMethod.ALL_GATHER => - AllGatherRequestToSync(numTasks, stageId, stageAttemptNumber, taskAttemptId, - barrierEpoch, partitionId, requestMethod, allGatherMessage) - } - } - - private def runBarrier( - requestMethod: RequestMethod.Value, - allGatherMessage: String = "" - ): String = { - + /** + * :: Experimental :: + * Sets a global barrier and waits until all tasks in this stage hit this barrier. Similar to + * MPI_Barrier function in MPI, the barrier() function call blocks until all tasks in the same + * stage have reached this routine. + * + * CAUTION! In a barrier stage, each task must have the same number of barrier() calls, in all + * possible code branches. Otherwise, you may get the job hanging or a SparkException after + * timeout. Some examples of '''misuses''' are listed below: + * 1. 
Only call barrier() function on a subset of all the tasks in the same barrier stage, it + * shall lead to timeout of the function call. + * {{{ + * rdd.barrier().mapPartitions { iter => + * val context = BarrierTaskContext.get() + * if (context.partitionId() == 0) { + * // Do nothing. + * } else { + * context.barrier() + * } + * iter + * } + * }}} + * + * 2. Include barrier() function in a try-catch code block, this may lead to timeout of the + * second function call. + * {{{ + * rdd.barrier().mapPartitions { iter => + * val context = BarrierTaskContext.get() + * try { + * // Do something that might throw an Exception. + * doSomething() + * context.barrier() + * } catch { + * case e: Exception => logWarning("...", e) + * } + * context.barrier() + * iter + * } + * }}} + */ + @Experimental + @Since("2.4.0") + def barrier(): Unit = { logInfo(s"Task $taskAttemptId from Stage $stageId(Attempt $stageAttemptNumber) has entered " + s"the global sync, current barrier epoch is $barrierEpoch.") logTrace("Current callSite: " + Utils.getCallSite()) @@ -108,12 +118,10 @@ class BarrierTaskContext private[spark] ( // Log the update of global sync every 60 seconds. timer.schedule(timerTask, 60000, 60000) - var json: String = "" - try { - val abortableRpcFuture = barrierCoordinator.askAbortable[String]( - message = getRequestToSync(numTasks, stageId, stageAttemptNumber, - taskAttemptId, barrierEpoch, partitionId, requestMethod, allGatherMessage), + val abortableRpcFuture = barrierCoordinator.askAbortable[Unit]( + message = RequestToSync(numTasks, stageId, stageAttemptNumber, taskAttemptId, + barrierEpoch), // Set a fixed timeout for RPC here, so users shall get a SparkException thrown by // BarrierCoordinator on timeout, instead of RPCTimeoutException from the RPC framework. timeout = new RpcTimeout(365.days, "barrierTimeout")) @@ -125,7 +133,7 @@ class BarrierTaskContext private[spark] ( while (!abortableRpcFuture.toFuture.isCompleted) { // wait RPC future for at most 1 second try { - json = ThreadUtils.awaitResult(abortableRpcFuture.toFuture, 1.second) + ThreadUtils.awaitResult(abortableRpcFuture.toFuture, 1.second) } catch { case _: TimeoutException | _: InterruptedException => // If `TimeoutException` thrown, waiting RPC future reach 1 second. @@ -155,73 +163,6 @@ class BarrierTaskContext private[spark] ( timerTask.cancel() timer.purge() } - json - } - - /** - * :: Experimental :: - * Sets a global barrier and waits until all tasks in this stage hit this barrier. Similar to - * MPI_Barrier function in MPI, the barrier() function call blocks until all tasks in the same - * stage have reached this routine. - * - * CAUTION! In a barrier stage, each task must have the same number of barrier() calls, in all - * possible code branches. Otherwise, you may get the job hanging or a SparkException after - * timeout. Some examples of '''misuses''' are listed below: - * 1. Only call barrier() function on a subset of all the tasks in the same barrier stage, it - * shall lead to timeout of the function call. - * {{{ - * rdd.barrier().mapPartitions { iter => - * val context = BarrierTaskContext.get() - * if (context.partitionId() == 0) { - * // Do nothing. - * } else { - * context.barrier() - * } - * iter - * } - * }}} - * - * 2. Include barrier() function in a try-catch code block, this may lead to timeout of the - * second function call. - * {{{ - * rdd.barrier().mapPartitions { iter => - * val context = BarrierTaskContext.get() - * try { - * // Do something that might throw an Exception. 
- * doSomething() - * context.barrier() - * } catch { - * case e: Exception => logWarning("...", e) - * } - * context.barrier() - * iter - * } - * }}} - */ - @Experimental - @Since("2.4.0") - def barrier(): Unit = { - runBarrier(RequestMethod.BARRIER) - () - } - - /** - * :: Experimental :: - * Blocks until all tasks in the same stage have reached this routine. Each task passes in - * a message and returns with a list of all the messages passed in by each of those tasks. - * - * CAUTION! The allGather method requires the same precautions as the barrier method - * - * The message is type String rather than Array[Byte] because it is more convenient for - * the user at the cost of worse performance. - */ - @Experimental - @Since("3.0.0") - def allGather(message: String): ArrayBuffer[String] = { - val json = runBarrier(RequestMethod.ALL_GATHER, message) - val jsonArray = parse(json) - implicit val formats = DefaultFormats - ArrayBuffer(jsonArray.extract[Array[String]]: _*) } /** diff --git a/core/src/main/scala/org/apache/spark/api/python/PythonRunner.scala b/core/src/main/scala/org/apache/spark/api/python/PythonRunner.scala index fa8bf0fc06358..658e0d593a167 100644 --- a/core/src/main/scala/org/apache/spark/api/python/PythonRunner.scala +++ b/core/src/main/scala/org/apache/spark/api/python/PythonRunner.scala @@ -24,13 +24,8 @@ import java.nio.charset.StandardCharsets.UTF_8 import java.util.concurrent.atomic.AtomicBoolean import scala.collection.JavaConverters._ -import scala.collection.mutable.ArrayBuffer import scala.util.control.NonFatal -import org.json4s.JsonAST._ -import org.json4s.JsonDSL._ -import org.json4s.jackson.JsonMethods.{compact, render} - import org.apache.spark._ import org.apache.spark.internal.Logging import org.apache.spark.internal.config.{BUFFER_SIZE, EXECUTOR_CORES} @@ -243,18 +238,13 @@ private[spark] abstract class BasePythonRunner[IN, OUT]( sock.setSoTimeout(10000) authHelper.authClient(sock) val input = new DataInputStream(sock.getInputStream()) - val requestMethod = input.readInt() - // The BarrierTaskContext function may wait infinitely, socket shall not timeout - // before the function finishes. - sock.setSoTimeout(0) - requestMethod match { + input.readInt() match { case BarrierTaskContextMessageProtocol.BARRIER_FUNCTION => - barrierAndServe(requestMethod, sock) - case BarrierTaskContextMessageProtocol.ALL_GATHER_FUNCTION => - val length = input.readInt() - val message = new Array[Byte](length) - input.readFully(message) - barrierAndServe(requestMethod, sock, new String(message, UTF_8)) + // The barrier() function may wait infinitely, socket shall not timeout + // before the function finishes. + sock.setSoTimeout(0) + barrierAndServe(sock) + case _ => val out = new DataOutputStream(new BufferedOutputStream( sock.getOutputStream)) @@ -405,31 +395,15 @@ private[spark] abstract class BasePythonRunner[IN, OUT]( } /** - * Gateway to call BarrierTaskContext methods. + * Gateway to call BarrierTaskContext.barrier(). */ - def barrierAndServe(requestMethod: Int, sock: Socket, message: String = ""): Unit = { - require( - serverSocket.isDefined, - "No available ServerSocket to redirect the BarrierTaskContext method call." 
- ) + def barrierAndServe(sock: Socket): Unit = { + require(serverSocket.isDefined, "No available ServerSocket to redirect the barrier() call.") + val out = new DataOutputStream(new BufferedOutputStream(sock.getOutputStream)) try { - var result: String = "" - requestMethod match { - case BarrierTaskContextMessageProtocol.BARRIER_FUNCTION => - context.asInstanceOf[BarrierTaskContext].barrier() - result = BarrierTaskContextMessageProtocol.BARRIER_RESULT_SUCCESS - case BarrierTaskContextMessageProtocol.ALL_GATHER_FUNCTION => - val messages: ArrayBuffer[String] = context.asInstanceOf[BarrierTaskContext].allGather( - message - ) - result = compact(render(JArray( - messages.map( - (message) => JString(message) - ).toList - ))) - } - writeUTF(result, out) + context.asInstanceOf[BarrierTaskContext].barrier() + writeUTF(BarrierTaskContextMessageProtocol.BARRIER_RESULT_SUCCESS, out) } catch { case e: SparkException => writeUTF(e.getMessage, out) @@ -664,7 +638,6 @@ private[spark] object SpecialLengths { private[spark] object BarrierTaskContextMessageProtocol { val BARRIER_FUNCTION = 1 - val ALL_GATHER_FUNCTION = 2 val BARRIER_RESULT_SUCCESS = "success" val ERROR_UNRECOGNIZED_FUNCTION = "Not recognized function call from python side." } diff --git a/core/src/test/scala/org/apache/spark/scheduler/BarrierTaskContextSuite.scala b/core/src/test/scala/org/apache/spark/scheduler/BarrierTaskContextSuite.scala index ed38b7f0ecac1..fc8ac38479932 100644 --- a/core/src/test/scala/org/apache/spark/scheduler/BarrierTaskContextSuite.scala +++ b/core/src/test/scala/org/apache/spark/scheduler/BarrierTaskContextSuite.scala @@ -19,7 +19,6 @@ package org.apache.spark.scheduler import java.io.File -import scala.collection.mutable.ArrayBuffer import scala.util.Random import org.apache.spark._ @@ -53,79 +52,6 @@ class BarrierTaskContextSuite extends SparkFunSuite with LocalSparkContext { assert(times.max - times.min <= 1000) } - test("share messages with allGather() call") { - val conf = new SparkConf() - .setMaster("local-cluster[4, 1, 1024]") - .setAppName("test-cluster") - sc = new SparkContext(conf) - val rdd = sc.makeRDD(1 to 10, 4) - val rdd2 = rdd.barrier().mapPartitions { it => - val context = BarrierTaskContext.get() - // Sleep for a random time before global sync. 
- Thread.sleep(Random.nextInt(1000)) - // Pass partitionId message in - val message = context.partitionId().toString - val messages = context.allGather(message) - messages.toList.iterator - } - // Take a sorted list of all the partitionId messages - val messages = rdd2.collect().head - // All the task partitionIds are shared - for((x, i) <- messages.view.zipWithIndex) assert(x == i.toString) - } - - test("throw exception if we attempt to synchronize with different blocking calls") { - val conf = new SparkConf() - .setMaster("local-cluster[4, 1, 1024]") - .setAppName("test-cluster") - sc = new SparkContext(conf) - val rdd = sc.makeRDD(1 to 10, 4) - val rdd2 = rdd.barrier().mapPartitions { it => - val context = BarrierTaskContext.get() - val partitionId = context.partitionId - if (partitionId == 0) { - context.barrier() - } else { - context.allGather(partitionId.toString) - } - Seq(null).iterator - } - val error = intercept[SparkException] { - rdd2.collect() - }.getMessage - assert(error.contains("does not match the current synchronized requestMethod")) - } - - test("successively sync with allGather and barrier") { - val conf = new SparkConf() - .setMaster("local-cluster[4, 1, 1024]") - .setAppName("test-cluster") - sc = new SparkContext(conf) - val rdd = sc.makeRDD(1 to 10, 4) - val rdd2 = rdd.barrier().mapPartitions { it => - val context = BarrierTaskContext.get() - // Sleep for a random time before global sync. - Thread.sleep(Random.nextInt(1000)) - context.barrier() - val time1 = System.currentTimeMillis() - // Sleep for a random time before global sync. - Thread.sleep(Random.nextInt(1000)) - // Pass partitionId message in - val message = context.partitionId().toString - val messages = context.allGather(message) - val time2 = System.currentTimeMillis() - Seq((time1, time2)).iterator - } - val times = rdd2.collect() - // All the tasks shall finish the first round of global sync within a short time slot. - val times1 = times.map(_._1) - assert(times1.max - times1.min <= 1000) - - // All the tasks shall finish the second round of global sync within a short time slot. - val times2 = times.map(_._2) - assert(times2.max - times2.min <= 1000) - } - test("support multiple barrier() call within a single task") { initLocalClusterSparkContext() val rdd = sc.makeRDD(1 to 10, 4) diff --git a/python/pyspark/taskcontext.py b/python/pyspark/taskcontext.py index 90bd2345ac525..d648f63338514 100644 --- a/python/pyspark/taskcontext.py +++ b/python/pyspark/taskcontext.py @@ -16,10 +16,9 @@ # from __future__ import print_function -import json from pyspark.java_gateway import local_connect_and_auth -from pyspark.serializers import write_int, write_with_length, UTF8Deserializer +from pyspark.serializers import write_int, UTF8Deserializer class TaskContext(object): @@ -108,28 +107,18 @@ def resources(self): BARRIER_FUNCTION = 1 -ALL_GATHER_FUNCTION = 2 -def _load_from_socket(port, auth_secret, function, all_gather_message=None): +def _load_from_socket(port, auth_secret): """ Load data from a given socket, this is a blocking method thus only return when the socket connection has been closed. """ (sockfile, sock) = local_connect_and_auth(port, auth_secret) - - # The call may block forever, so no timeout + # The barrier() call may block forever, so no timeout sock.settimeout(None) - - if function == BARRIER_FUNCTION: - # Make a barrier() function call. - write_int(function, sockfile) - elif function == ALL_GATHER_FUNCTION: - # Make a all_gather() function call. 
- write_int(function, sockfile) - write_with_length(all_gather_message.encode("utf-8"), sockfile) - else: - raise ValueError("Unrecognized function type") + # Make a barrier() function call. + write_int(BARRIER_FUNCTION, sockfile) sockfile.flush() # Collect result. @@ -210,33 +199,7 @@ def barrier(self): raise Exception("Not supported to call barrier() before initialize " + "BarrierTaskContext.") else: - _load_from_socket(self._port, self._secret, BARRIER_FUNCTION) - - def allGather(self, message=""): - """ - .. note:: Experimental - - This function blocks until all tasks in the same stage have reached this routine. - Each task passes in a message and returns with a list of all the messages passed in - by each of those tasks. - - .. warning:: In a barrier stage, each task much have the same number of `allGather()` - calls, in all possible code branches. - Otherwise, you may get the job hanging or a SparkException after timeout. - """ - if not isinstance(message, str): - raise ValueError("Argument `message` must be of type `str`") - elif self._port is None or self._secret is None: - raise Exception("Not supported to call barrier() before initialize " + - "BarrierTaskContext.") - else: - gathered_items = _load_from_socket( - self._port, - self._secret, - ALL_GATHER_FUNCTION, - message, - ) - return [e for e in json.loads(gathered_items)] + _load_from_socket(self._port, self._secret) def getTaskInfos(self): """ diff --git a/python/pyspark/tests/test_taskcontext.py b/python/pyspark/tests/test_taskcontext.py index 0053aadd9c3ed..68cfe814762e0 100644 --- a/python/pyspark/tests/test_taskcontext.py +++ b/python/pyspark/tests/test_taskcontext.py @@ -135,26 +135,6 @@ def context_barrier(x): times = rdd.barrier().mapPartitions(f).map(context_barrier).collect() self.assertTrue(max(times) - min(times) < 1) - def test_all_gather(self): - """ - Verify that BarrierTaskContext.allGather() performs global sync among all barrier tasks - within a stage and passes messages properly. - """ - rdd = self.sc.parallelize(range(10), 4) - - def f(iterator): - yield sum(iterator) - - def context_barrier(x): - tc = BarrierTaskContext.get() - time.sleep(random.randint(1, 10)) - out = tc.allGather(str(context.partitionId())) - pids = [int(e) for e in out] - return [pids] - - pids = rdd.barrier().mapPartitions(f).map(context_barrier).collect()[0] - self.assertTrue(pids == [0, 1, 2, 3]) - def test_barrier_infos(self): """ Verify that BarrierTaskContext.getTaskInfos() returns a list of all task infos in the From 25db8c71a2100c167b8c2d7a6c540ebc61db9b73 Mon Sep 17 00:00:00 2001 From: David Toneian Date: Fri, 14 Feb 2020 11:00:35 +0900 Subject: [PATCH 043/185] [PYSPARK][DOCS][MINOR] Changed `:func:` to `:attr:` Sphinx roles, fixed links in documentation of `Data{Frame,Stream}{Reader,Writer}` This commit is published into the public domain. ### What changes were proposed in this pull request? This PR fixes the documentation of `DataFrameReader`, `DataFrameWriter`, `DataStreamReader`, and `DataStreamWriter`, where attributes of other classes were misrepresented as functions. Additionally, creation of hyperlinks across modules was fixed in these instances. ### Why are the changes needed? The old state produced documentation that suggested invalid usage of PySpark objects (accessing attributes as though they were callable.) ### Does this PR introduce any user-facing change? No, except for improved documentation. ### How was this patch tested? No test added; documentation build runs through. 
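The user-facing point behind swapping `:func:` for `:attr:` is that these entry points are properties, not callables. A short illustrative sketch (the paths are placeholders, and an active `SparkSession` named `spark` is assumed):

```python
# `read` and `write` are attributes, so they are accessed without parentheses.
df = spark.read.csv("/tmp/example.csv", header=True)
df.write.mode("overwrite").parquet("/tmp/example_parquet")

# The usage the old `:func:` role hinted at is invalid -- `read` is not callable:
# spark.read().csv("/tmp/example.csv")  # TypeError: 'DataFrameReader' object is not callable
```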
Closes #27553 from DavidToneian/docfix-DataFrameReader-DataFrameWriter-DataStreamReader-DataStreamWriter. Authored-by: David Toneian Signed-off-by: HyukjinKwon --- python/pyspark/sql/readwriter.py | 4 ++-- python/pyspark/sql/streaming.py | 11 ++++++----- 2 files changed, 8 insertions(+), 7 deletions(-) diff --git a/python/pyspark/sql/readwriter.py b/python/pyspark/sql/readwriter.py index 3d3280dbd9943..69660395ad823 100644 --- a/python/pyspark/sql/readwriter.py +++ b/python/pyspark/sql/readwriter.py @@ -48,7 +48,7 @@ def _set_opts(self, schema=None, **options): class DataFrameReader(OptionUtils): """ Interface used to load a :class:`DataFrame` from external storage systems - (e.g. file systems, key-value stores, etc). Use :func:`spark.read` + (e.g. file systems, key-value stores, etc). Use :attr:`SparkSession.read` to access this. .. versionadded:: 1.4 @@ -616,7 +616,7 @@ def jdbc(self, url, table, column=None, lowerBound=None, upperBound=None, numPar class DataFrameWriter(OptionUtils): """ Interface used to write a :class:`DataFrame` to external storage systems - (e.g. file systems, key-value stores, etc). Use :func:`DataFrame.write` + (e.g. file systems, key-value stores, etc). Use :attr:`DataFrame.write` to access this. .. versionadded:: 1.4 diff --git a/python/pyspark/sql/streaming.py b/python/pyspark/sql/streaming.py index f17a52f6b3dc8..5fced8aca9bdf 100644 --- a/python/pyspark/sql/streaming.py +++ b/python/pyspark/sql/streaming.py @@ -276,9 +276,9 @@ def resetTerminated(self): class DataStreamReader(OptionUtils): """ - Interface used to load a streaming :class:`DataFrame` from external storage systems - (e.g. file systems, key-value stores, etc). Use :func:`spark.readStream` - to access this. + Interface used to load a streaming :class:`DataFrame ` from external + storage systems (e.g. file systems, key-value stores, etc). + Use :attr:`SparkSession.readStream ` to access this. .. note:: Evolving. @@ -750,8 +750,9 @@ def csv(self, path, schema=None, sep=None, encoding=None, quote=None, escape=Non class DataStreamWriter(object): """ - Interface used to write a streaming :class:`DataFrame` to external storage systems - (e.g. file systems, key-value stores, etc). Use :func:`DataFrame.writeStream` + Interface used to write a streaming :class:`DataFrame ` to external + storage systems (e.g. file systems, key-value stores, etc). + Use :attr:`DataFrame.writeStream ` to access this. .. note:: Evolving. From 0aed77a0155b404e39bc5dbc0579e29e4c7bf887 Mon Sep 17 00:00:00 2001 From: maryannxue Date: Fri, 14 Feb 2020 11:20:55 +0800 Subject: [PATCH 044/185] [SPARK-30801][SQL] Subqueries should not be AQE-ed if main query is not ### What changes were proposed in this pull request? This PR makes sure AQE is either enabled or disabled for the entire query, including the main query and all subqueries. Currently there are unsupported queries by AQE, e.g., queries that contain DPP filters. We need to make sure that if the main query is unsupported, none of the sub-queries should apply AQE, otherwise it can lead to performance regressions due to missed opportunity of sub-query reuse. ### Why are the changes needed? To get rid of potential perf regressions when AQE is turned on. ### Does this PR introduce any user-facing change? No. ### How was this patch tested? Updated DynamicPartitionPruningSuite: 1. Removed the existing workaround `withSQLConf(SQLConf.ADAPTIVE_EXECUTION_ENABLED.key, "false")` 2. 
Added `DynamicPartitionPruningSuiteAEOn` and `DynamicPartitionPruningSuiteAEOff` to enable testing this suite with AQE on and off options 3. Added a check in `checkPartitionPruningPredicate` to verify that the subqueries are always in sync with the main query in terms of whether AQE is applied. Closes #27554 from maryannxue/spark-30801. Authored-by: maryannxue Signed-off-by: Wenchen Fan --- .../spark/sql/execution/QueryExecution.scala | 19 ++++++++++--- .../sql/DynamicPartitionPruningSuite.scala | 27 ++++++++++++++++--- 2 files changed, 39 insertions(+), 7 deletions(-) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/QueryExecution.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/QueryExecution.scala index 53b6b5d82c021..9109c05e75853 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/QueryExecution.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/QueryExecution.scala @@ -274,13 +274,25 @@ object QueryExecution { * are correct, insert whole stage code gen, and try to reduce the work done by reusing exchanges * and subqueries. */ - private[execution] def preparations(sparkSession: SparkSession): Seq[Rule[SparkPlan]] = + private[execution] def preparations(sparkSession: SparkSession): Seq[Rule[SparkPlan]] = { + + val sparkSessionWithAdaptiveExecutionOff = + if (sparkSession.sessionState.conf.adaptiveExecutionEnabled) { + val session = sparkSession.cloneSession() + session.sessionState.conf.setConf(SQLConf.ADAPTIVE_EXECUTION_ENABLED, false) + session + } else { + sparkSession + } + Seq( // `AdaptiveSparkPlanExec` is a leaf node. If inserted, all the following rules will be no-op // as the original plan is hidden behind `AdaptiveSparkPlanExec`. InsertAdaptiveSparkPlan(AdaptiveExecutionContext(sparkSession)), - PlanDynamicPruningFilters(sparkSession), - PlanSubqueries(sparkSession), + // If the following rules apply, it means the main query is not AQE-ed, so we make sure the + // subqueries are not AQE-ed either. 
+ PlanDynamicPruningFilters(sparkSessionWithAdaptiveExecutionOff), + PlanSubqueries(sparkSessionWithAdaptiveExecutionOff), EnsureRequirements(sparkSession.sessionState.conf), ApplyColumnarRulesAndInsertTransitions(sparkSession.sessionState.conf, sparkSession.sessionState.columnarRules), @@ -288,6 +300,7 @@ object QueryExecution { ReuseExchange(sparkSession.sessionState.conf), ReuseSubquery(sparkSession.sessionState.conf) ) + } /** * Prepares a planned [[SparkPlan]] for execution by inserting shuffle operations and internal diff --git a/sql/core/src/test/scala/org/apache/spark/sql/DynamicPartitionPruningSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/DynamicPartitionPruningSuite.scala index f7b51d6f4c8ef..baa9f5ecafc68 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/DynamicPartitionPruningSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/DynamicPartitionPruningSuite.scala @@ -22,7 +22,7 @@ import org.scalatest.GivenWhenThen import org.apache.spark.sql.catalyst.expressions.{DynamicPruningExpression, Expression} import org.apache.spark.sql.catalyst.plans.ExistenceJoin import org.apache.spark.sql.execution._ -import org.apache.spark.sql.execution.adaptive.AdaptiveSparkPlanHelper +import org.apache.spark.sql.execution.adaptive.{AdaptiveSparkPlanExec, AdaptiveSparkPlanHelper} import org.apache.spark.sql.execution.exchange.{BroadcastExchangeExec, ReusedExchangeExec} import org.apache.spark.sql.execution.joins.BroadcastHashJoinExec import org.apache.spark.sql.execution.streaming.{MemoryStream, StreamingQueryWrapper} @@ -33,7 +33,7 @@ import org.apache.spark.sql.test.SharedSparkSession /** * Test suite for the filtering ratio policy used to trigger dynamic partition pruning (DPP). */ -class DynamicPartitionPruningSuite +abstract class DynamicPartitionPruningSuiteBase extends QueryTest with SharedSparkSession with GivenWhenThen @@ -43,9 +43,14 @@ class DynamicPartitionPruningSuite import testImplicits._ + val adaptiveExecutionOn: Boolean + override def beforeAll(): Unit = { super.beforeAll() + spark.sessionState.conf.setConf(SQLConf.ADAPTIVE_EXECUTION_ENABLED, adaptiveExecutionOn) + spark.sessionState.conf.setConf(SQLConf.ADAPTIVE_EXECUTION_FORCE_APPLY, true) + val factData = Seq[(Int, Int, Int, Int)]( (1000, 1, 1, 10), (1010, 2, 1, 10), @@ -153,6 +158,8 @@ class DynamicPartitionPruningSuite sql("DROP TABLE IF EXISTS fact_stats") sql("DROP TABLE IF EXISTS dim_stats") } finally { + spark.sessionState.conf.unsetConf(SQLConf.ADAPTIVE_EXECUTION_ENABLED) + spark.sessionState.conf.unsetConf(SQLConf.ADAPTIVE_EXECUTION_FORCE_APPLY) super.afterAll() } } @@ -195,6 +202,11 @@ class DynamicPartitionPruningSuite fail(s"Invalid child node found in\n$s") } } + + val isMainQueryAdaptive = plan.isInstanceOf[AdaptiveSparkPlanExec] + subqueriesAll(plan).filterNot(subqueryBroadcast.contains).foreach { s => + assert(s.find(_.isInstanceOf[AdaptiveSparkPlanExec]).isDefined == isMainQueryAdaptive) + } } /** @@ -1173,8 +1185,7 @@ class DynamicPartitionPruningSuite } test("join key with multiple references on the filtering plan") { - withSQLConf(SQLConf.DYNAMIC_PARTITION_PRUNING_REUSE_BROADCAST_ONLY.key -> "true", - SQLConf.ADAPTIVE_EXECUTION_ENABLED.key -> "false") { + withSQLConf(SQLConf.DYNAMIC_PARTITION_PRUNING_REUSE_BROADCAST_ONLY.key -> "true") { // when enable AQE, the reusedExchange is inserted when executed. 
withTable("fact", "dim") { spark.range(100).select( @@ -1270,3 +1281,11 @@ class DynamicPartitionPruningSuite } } } + +class DynamicPartitionPruningSuiteAEOff extends DynamicPartitionPruningSuiteBase { + override val adaptiveExecutionOn: Boolean = false +} + +class DynamicPartitionPruningSuiteAEOn extends DynamicPartitionPruningSuiteBase { + override val adaptiveExecutionOn: Boolean = true +} From b2134ee73cfad4d78aaf8f0a2898011ac0308e48 Mon Sep 17 00:00:00 2001 From: David Toneian Date: Fri, 14 Feb 2020 13:49:11 +0900 Subject: [PATCH 045/185] [SPARK-30823][PYTHON][DOCS] Set `%PYTHONPATH%` when building PySpark documentation on Windows MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This commit is published into the public domain. ### What changes were proposed in this pull request? In analogy to `python/docs/Makefile`, which has > export PYTHONPATH=$(realpath ..):$(realpath ../lib/py4j-0.10.8.1-src.zip) on line 10, this PR adds > set PYTHONPATH=..;..\lib\py4j-0.10.8.1-src.zip to `make2.bat`. Since there is no `realpath` in default installations of Windows, I left the relative paths unresolved. Per the instructions on how to build docs, `make.bat` is supposed to be run from `python/docs` as the working directory, so this should probably not cause issues (`%BUILDDIR%` is a relative path as well.) ### Why are the changes needed? When building the PySpark documentation on Windows, by changing directory to `python/docs` and running `make.bat` (which runs `make2.bat`), the majority of the documentation may not be built if pyspark is not in the default `%PYTHONPATH%`. Sphinx then reports that `pyspark` (and possibly dependencies) cannot be imported. If `pyspark` is in the default `%PYTHONPATH%`, I suppose it is that version of `pyspark` – as opposed to the version found above the `python/docs` directory – that is considered when building the documentation, which may result in documentation that does not correspond to the development version one is trying to build. ### Does this PR introduce any user-facing change? No. ### How was this patch tested? Manual tests on my Windows 10 machine. Additional tests with other environments very welcome! Closes #27569 from DavidToneian/SPARK-30823. Authored-by: David Toneian Signed-off-by: HyukjinKwon --- python/docs/make2.bat | 2 ++ 1 file changed, 2 insertions(+) diff --git a/python/docs/make2.bat b/python/docs/make2.bat index 05d22eb5cdd23..742df373166da 100644 --- a/python/docs/make2.bat +++ b/python/docs/make2.bat @@ -2,6 +2,8 @@ REM Command file for Sphinx documentation +set PYTHONPATH=..;..\lib\py4j-0.10.8.1-src.zip + if "%SPHINXBUILD%" == "" ( set SPHINXBUILD=sphinx-build From 99b8136a86030411e6bcbd312f40eb2a901ab0f0 Mon Sep 17 00:00:00 2001 From: "yi.wu" Date: Fri, 14 Feb 2020 16:52:28 +0800 Subject: [PATCH 046/185] [SPARK-25990][SQL] ScriptTransformation should handle different data types correctly ### What changes were proposed in this pull request? We should convert Spark InternalRows to hive data via `HiveInspectors.wrapperFor`. ### Why are the changes needed? 
We may hit below exception without this change: ``` [info] org.apache.spark.SparkException: Job aborted due to stage failure: Task 0 in stage 1.0 failed 1 times, most recent failure: Lost task 0.0 in stage 1.0 (TID 1, 192.168.1.6, executor driver): java.lang.ClassCastException: org.apache.spark.sql.types.Decimal cannot be cast to org.apache.hadoop.hive.common.type.HiveDecimal [info] at org.apache.hadoop.hive.serde2.objectinspector.primitive.JavaHiveDecimalObjectInspector.getPrimitiveJavaObject(JavaHiveDecimalObjectInspector.java:55) [info] at org.apache.hadoop.hive.serde2.lazy.LazyUtils.writePrimitiveUTF8(LazyUtils.java:321) [info] at org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe.serialize(LazySimpleSerDe.java:292) [info] at org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe.serializeField(LazySimpleSerDe.java:247) [info] at org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe.doSerialize(LazySimpleSerDe.java:231) [info] at org.apache.hadoop.hive.serde2.AbstractEncodingAwareSerDe.serialize(AbstractEncodingAwareSerDe.java:55) [info] at org.apache.spark.sql.hive.execution.ScriptTransformationWriterThread.$anonfun$run$2(ScriptTransformationExec.scala:300) [info] at org.apache.spark.sql.hive.execution.ScriptTransformationWriterThread.$anonfun$run$2$adapted(ScriptTransformationExec.scala:281) [info] at scala.collection.Iterator.foreach(Iterator.scala:941) [info] at scala.collection.Iterator.foreach$(Iterator.scala:941) [info] at scala.collection.AbstractIterator.foreach(Iterator.scala:1429) [info] at org.apache.spark.sql.hive.execution.ScriptTransformationWriterThread.$anonfun$run$1(ScriptTransformationExec.scala:281) [info] at scala.runtime.java8.JFunction0$mcV$sp.apply(JFunction0$mcV$sp.java:23) [info] at org.apache.spark.util.Utils$.logUncaughtExceptions(Utils.scala:1932) [info] at org.apache.spark.sql.hive.execution.ScriptTransformationWriterThread.run(ScriptTransformationExec.scala:270) ``` ### Does this PR introduce any user-facing change? No. ### How was this patch tested? Added new test. But please note that this test returns different result between Hive1.2 and Hive2.3 due to `HiveDecimal` or `SerDe` difference(don't know the root cause yet). Closes #27556 from Ngone51/script_transform. Lead-authored-by: yi.wu Co-authored-by: Wenchen Fan Signed-off-by: Wenchen Fan --- .../execution/ScriptTransformationExec.scala | 32 ++++++++----- sql/hive/src/test/resources/test_script.py | 21 +++++++++ .../execution/ScriptTransformationSuite.scala | 46 ++++++++++++++++++- 3 files changed, 85 insertions(+), 14 deletions(-) create mode 100644 sql/hive/src/test/resources/test_script.py diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/ScriptTransformationExec.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/ScriptTransformationExec.scala index e12f663304e7a..40f7b4e8db7c5 100644 --- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/ScriptTransformationExec.scala +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/ScriptTransformationExec.scala @@ -94,9 +94,8 @@ case class ScriptTransformationExec( // This new thread will consume the ScriptTransformation's input rows and write them to the // external process. That process's output will be read by this current thread. 
val writerThread = new ScriptTransformationWriterThread( - inputIterator, + inputIterator.map(outputProjection), input.map(_.dataType), - outputProjection, inputSerde, inputSoi, ioschema, @@ -249,16 +248,15 @@ case class ScriptTransformationExec( private class ScriptTransformationWriterThread( iter: Iterator[InternalRow], inputSchema: Seq[DataType], - outputProjection: Projection, @Nullable inputSerde: AbstractSerDe, - @Nullable inputSoi: ObjectInspector, + @Nullable inputSoi: StructObjectInspector, ioschema: HiveScriptIOSchema, outputStream: OutputStream, proc: Process, stderrBuffer: CircularBuffer, taskContext: TaskContext, conf: Configuration - ) extends Thread("Thread-ScriptTransformation-Feed") with Logging { + ) extends Thread("Thread-ScriptTransformation-Feed") with HiveInspectors with Logging { setDaemon(true) @@ -278,8 +276,8 @@ private class ScriptTransformationWriterThread( var threwException: Boolean = true val len = inputSchema.length try { - iter.map(outputProjection).foreach { row => - if (inputSerde == null) { + if (inputSerde == null) { + iter.foreach { row => val data = if (len == 0) { ioschema.inputRowFormatMap("TOK_TABLEROWFORMATLINES") } else { @@ -295,10 +293,21 @@ private class ScriptTransformationWriterThread( sb.toString() } outputStream.write(data.getBytes(StandardCharsets.UTF_8)) - } else { - val writable = inputSerde.serialize( - row.asInstanceOf[GenericInternalRow].values, inputSoi) + } + } else { + // Convert Spark InternalRows to hive data via `HiveInspectors.wrapperFor`. + val hiveData = new Array[Any](inputSchema.length) + val fieldOIs = inputSoi.getAllStructFieldRefs.asScala.map(_.getFieldObjectInspector).toArray + val wrappers = fieldOIs.zip(inputSchema).map { case (f, dt) => wrapperFor(f, dt) } + + iter.foreach { row => + var i = 0 + while (i < fieldOIs.length) { + hiveData(i) = if (row.isNullAt(i)) null else wrappers(i)(row.get(i, inputSchema(i))) + i += 1 + } + val writable = inputSerde.serialize(hiveData, inputSoi) if (scriptInputWriter != null) { scriptInputWriter.write(writable) } else { @@ -374,14 +383,13 @@ case class HiveScriptIOSchema ( val outputRowFormatMap = outputRowFormat.toMap.withDefault((k) => defaultFormat(k)) - def initInputSerDe(input: Seq[Expression]): Option[(AbstractSerDe, ObjectInspector)] = { + def initInputSerDe(input: Seq[Expression]): Option[(AbstractSerDe, StructObjectInspector)] = { inputSerdeClass.map { serdeClass => val (columns, columnTypes) = parseAttrs(input) val serde = initSerDe(serdeClass, columns, columnTypes, inputSerdeProps) val fieldObjectInspectors = columnTypes.map(toInspector) val objectInspector = ObjectInspectorFactory .getStandardStructObjectInspector(columns.asJava, fieldObjectInspectors.asJava) - .asInstanceOf[ObjectInspector] (serde, objectInspector) } } diff --git a/sql/hive/src/test/resources/test_script.py b/sql/hive/src/test/resources/test_script.py new file mode 100644 index 0000000000000..82ef7b38f0c1b --- /dev/null +++ b/sql/hive/src/test/resources/test_script.py @@ -0,0 +1,21 @@ +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. 
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +import sys +for line in sys.stdin: + (a, b, c, d, e) = line.split('\t') + sys.stdout.write('\t'.join([a, b, c, d, e])) + sys.stdout.flush() diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/ScriptTransformationSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/ScriptTransformationSuite.scala index 80a50c18bcb93..7d01fc53a4099 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/ScriptTransformationSuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/ScriptTransformationSuite.scala @@ -17,6 +17,8 @@ package org.apache.spark.sql.hive.execution +import java.sql.Timestamp + import org.apache.hadoop.hive.serde2.`lazy`.LazySimpleSerDe import org.scalatest.Assertions._ import org.scalatest.BeforeAndAfterEach @@ -24,15 +26,18 @@ import org.scalatest.exceptions.TestFailedException import org.apache.spark.{SparkException, TaskContext, TestUtils} import org.apache.spark.rdd.RDD +import org.apache.spark.sql.Column import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions.{Attribute, AttributeReference} import org.apache.spark.sql.catalyst.plans.physical.Partitioning import org.apache.spark.sql.execution.{SparkPlan, SparkPlanTest, UnaryExecNode} +import org.apache.spark.sql.hive.HiveUtils import org.apache.spark.sql.hive.test.TestHiveSingleton +import org.apache.spark.sql.test.SQLTestUtils import org.apache.spark.sql.types.StringType -class ScriptTransformationSuite extends SparkPlanTest with TestHiveSingleton with - BeforeAndAfterEach { +class ScriptTransformationSuite extends SparkPlanTest with SQLTestUtils with TestHiveSingleton + with BeforeAndAfterEach { import spark.implicits._ private val noSerdeIOSchema = HiveScriptIOSchema( @@ -186,6 +191,43 @@ class ScriptTransformationSuite extends SparkPlanTest with TestHiveSingleton wit rowsDf.select("name").collect()) assert(uncaughtExceptionHandler.exception.isEmpty) } + + test("SPARK-25990: TRANSFORM should handle different data types correctly") { + assume(TestUtils.testCommandAvailable("python")) + val scriptFilePath = getTestResourcePath("test_script.py") + + withTempView("v") { + val df = Seq( + (1, "1", 1.0, BigDecimal(1.0), new Timestamp(1)), + (2, "2", 2.0, BigDecimal(2.0), new Timestamp(2)), + (3, "3", 3.0, BigDecimal(3.0), new Timestamp(3)) + ).toDF("a", "b", "c", "d", "e") // Note column d's data type is Decimal(38, 18) + df.createTempView("v") + + val query = sql( + s""" + |SELECT + |TRANSFORM(a, b, c, d, e) + |USING 'python $scriptFilePath' AS (a, b, c, d, e) + |FROM v + """.stripMargin) + + // In Hive1.2, it does not do well on Decimal conversion. For example, in this case, + // it converts a decimal value's type from Decimal(38, 18) to Decimal(1, 0). So we need + // do extra cast here for Hive1.2. But in Hive2.3, it still keeps the original Decimal type. 
+ val decimalToString: Column => Column = if (HiveUtils.isHive23) { + c => c.cast("string") + } else { + c => c.cast("decimal(1, 0)").cast("string") + } + checkAnswer(query, identity, df.select( + 'a.cast("string"), + 'b.cast("string"), + 'c.cast("string"), + decimalToString('d), + 'e.cast("string")).collect()) + } + } } private case class ExceptionInjectingOperator(child: SparkPlan) extends UnaryExecNode { From 2a270a731a3b1da9a0fb036d648dd522e5c4d5ad Mon Sep 17 00:00:00 2001 From: HyukjinKwon Date: Fri, 14 Feb 2020 18:20:18 +0800 Subject: [PATCH 047/185] [SPARK-30810][SQL] Parses and convert a CSV Dataset having different column from 'value' in csv(dataset) API ### What changes were proposed in this pull request? This PR fixes `DataFrameReader.csv(dataset: Dataset[String])` API to take a `Dataset[String]` originated from a column name different from `value`. This is a long-standing bug started from the very first place. `CSVUtils.filterCommentAndEmpty` assumed the `Dataset[String]` to be originated with `value` column. This PR changes to use the first column name in the schema. ### Why are the changes needed? For `DataFrameReader.csv(dataset: Dataset[String])` to support any `Dataset[String]` as the signature indicates. ### Does this PR introduce any user-facing change? Yes, ```scala val ds = spark.range(2).selectExpr("concat('a,b,', id) AS text").as[String] spark.read.option("header", true).option("inferSchema", true).csv(ds).show() ``` Before: ``` org.apache.spark.sql.AnalysisException: cannot resolve '`value`' given input columns: [text];; 'Filter (length(trim('value, None)) > 0) +- Project [concat(a,b,, cast(id#0L as string)) AS text#2] +- Range (0, 2, step=1, splits=Some(2)) ``` After: ``` +---+---+---+ | a| b| 0| +---+---+---+ | a| b| 1| +---+---+---+ ``` ### How was this patch tested? Unittest was added. Closes #27561 from HyukjinKwon/SPARK-30810. Authored-by: HyukjinKwon Signed-off-by: Wenchen Fan --- .../spark/sql/execution/datasources/csv/CSVUtils.scala | 7 ++++--- .../spark/sql/execution/datasources/csv/CSVSuite.scala | 7 +++++++ 2 files changed, 11 insertions(+), 3 deletions(-) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVUtils.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVUtils.scala index 21fabac472f4b..d8b52c503ad34 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVUtils.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVUtils.scala @@ -33,11 +33,12 @@ object CSVUtils { // with the one below, `filterCommentAndEmpty` but execution path is different. One of them // might have to be removed in the near future if possible. 
import lines.sqlContext.implicits._ - val nonEmptyLines = lines.filter(length(trim($"value")) > 0) + val aliased = lines.toDF("value") + val nonEmptyLines = aliased.filter(length(trim($"value")) > 0) if (options.isCommentSet) { - nonEmptyLines.filter(!$"value".startsWith(options.comment.toString)) + nonEmptyLines.filter(!$"value".startsWith(options.comment.toString)).as[String] } else { - nonEmptyLines + nonEmptyLines.as[String] } } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVSuite.scala index b1105b4a63bba..0be0e1e3da3dc 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVSuite.scala @@ -2294,6 +2294,13 @@ abstract class CSVSuite extends QueryTest with SharedSparkSession with TestCsvDa } } } + + test("SPARK-30810: parses and convert a CSV Dataset having different column from 'value'") { + val ds = spark.range(2).selectExpr("concat('a,b,', id) AS `a.text`").as[String] + val csv = spark.read.option("header", true).option("inferSchema", true).csv(ds) + assert(csv.schema.fieldNames === Seq("a", "b", "0")) + checkAnswer(csv, Row("a", "b", 1)) + } } class CSVv1Suite extends CSVSuite { From 7137a6d065edeaab97bf5bf49ffaca3d060a14fe Mon Sep 17 00:00:00 2001 From: Maxim Gekk Date: Fri, 14 Feb 2020 22:16:57 +0800 Subject: [PATCH 048/185] [SPARK-30766][SQL] Fix the timestamp truncation to the `HOUR` and `DAY` levels ### What changes were proposed in this pull request? In the PR, I propose to use Java 8 time API in timestamp truncations to the levels of `HOUR` and `DAY`. The problem is in the usage of `timeZone.getOffset(millis)` in days/hours truncations where the combined calendar (Julian + Gregorian) is used underneath. ### Why are the changes needed? The change fix wrong truncations. For example, the following truncation to hours should print `0010-01-01 01:00:00` but it outputs wrong timestamp: ```scala Seq("0010-01-01 01:02:03.123456").toDF() .select($"value".cast("timestamp").as("ts")) .select(date_trunc("HOUR", $"ts").cast("string")) .show(false) +------------------------------------+ |CAST(date_trunc(HOUR, ts) AS STRING)| +------------------------------------+ |0010-01-01 01:30:17 | +------------------------------------+ ``` ### Does this PR introduce any user-facing change? Yes. After the changes, the result of the example above is: ```scala +------------------------------------+ |CAST(date_trunc(HOUR, ts) AS STRING)| +------------------------------------+ |0010-01-01 01:00:00 | +------------------------------------+ ``` ### How was this patch tested? - Added new test to `DateFunctionsSuite` - By `DateExpressionsSuite` and `DateTimeUtilsSuite` Closes #27512 from MaxGekk/fix-trunc-old-timestamp. 
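For reference, the same reproduction expressed through the PySpark API (a sketch assuming an active `SparkSession` named `spark`; the exact pre-fix offset depends on the session time zone, since it comes from the hybrid Julian + Gregorian calendar):

```python
from pyspark.sql import functions as F

df = spark.createDataFrame([("0010-01-01 01:02:03.123456",)], ["value"])
ts = F.col("value").cast("timestamp")
df.select(
    F.date_trunc("HOUR", ts).cast("string").alias("hour_trunc"),
    F.date_trunc("DAY", ts).cast("string").alias("day_trunc"),
).show(truncate=False)
# Expected with the fix: 0010-01-01 01:00:00 and 0010-01-01 00:00:00
```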
Authored-by: Maxim Gekk Signed-off-by: Wenchen Fan --- .../expressions/datetimeExpressions.scala | 6 +-- .../sql/catalyst/util/DateTimeUtils.scala | 44 ++++++++++--------- .../catalyst/util/DateTimeUtilsSuite.scala | 39 ++++++++-------- .../apache/spark/sql/DateFunctionsSuite.scala | 13 ++++++ 4 files changed, 59 insertions(+), 43 deletions(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/datetimeExpressions.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/datetimeExpressions.scala index cf91489d8e6b7..adf7251256041 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/datetimeExpressions.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/datetimeExpressions.scala @@ -1690,15 +1690,15 @@ case class TruncTimestamp( override def eval(input: InternalRow): Any = { evalHelper(input, minLevel = MIN_LEVEL_OF_TIMESTAMP_TRUNC) { (t: Any, level: Int) => - DateTimeUtils.truncTimestamp(t.asInstanceOf[Long], level, timeZone) + DateTimeUtils.truncTimestamp(t.asInstanceOf[Long], level, zoneId) } } override def doGenCode(ctx: CodegenContext, ev: ExprCode): ExprCode = { - val tz = ctx.addReferenceObj("timeZone", timeZone) + val zid = ctx.addReferenceObj("zoneId", zoneId, classOf[ZoneId].getName) codeGenHelper(ctx, ev, minLevel = MIN_LEVEL_OF_TIMESTAMP_TRUNC, true) { (date: String, fmt: String) => - s"truncTimestamp($date, $fmt, $tz);" + s"truncTimestamp($date, $fmt, $zid);" } } } diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/DateTimeUtils.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/DateTimeUtils.scala index 5976bcbb52fd7..dcc7337116777 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/DateTimeUtils.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/DateTimeUtils.scala @@ -713,32 +713,34 @@ object DateTimeUtils { } } + private def truncToUnit(t: SQLTimestamp, zoneId: ZoneId, unit: ChronoUnit): SQLTimestamp = { + val truncated = microsToInstant(t).atZone(zoneId).truncatedTo(unit) + instantToMicros(truncated.toInstant) + } + /** * Returns the trunc date time from original date time and trunc level. * Trunc level should be generated using `parseTruncLevel()`, should be between 0 and 12. 
*/ - def truncTimestamp(t: SQLTimestamp, level: Int, timeZone: TimeZone): SQLTimestamp = { - if (level == TRUNC_TO_MICROSECOND) return t - var millis = MICROSECONDS.toMillis(t) - val truncated = level match { - case TRUNC_TO_MILLISECOND => millis - case TRUNC_TO_SECOND => - millis - millis % MILLIS_PER_SECOND - case TRUNC_TO_MINUTE => - millis - millis % MILLIS_PER_MINUTE - case TRUNC_TO_HOUR => - val offset = timeZone.getOffset(millis) - millis += offset - millis - millis % MILLIS_PER_HOUR - offset - case TRUNC_TO_DAY => - val offset = timeZone.getOffset(millis) - millis += offset - millis - millis % MILLIS_PER_DAY - offset - case _ => // Try to truncate date levels - val dDays = millisToDays(millis, timeZone.toZoneId) - daysToMillis(truncDate(dDays, level), timeZone.toZoneId) + def truncTimestamp(t: SQLTimestamp, level: Int, zoneId: ZoneId): SQLTimestamp = { + level match { + case TRUNC_TO_MICROSECOND => t + case TRUNC_TO_HOUR => truncToUnit(t, zoneId, ChronoUnit.HOURS) + case TRUNC_TO_DAY => truncToUnit(t, zoneId, ChronoUnit.DAYS) + case _ => + val millis = MICROSECONDS.toMillis(t) + val truncated = level match { + case TRUNC_TO_MILLISECOND => millis + case TRUNC_TO_SECOND => + millis - millis % MILLIS_PER_SECOND + case TRUNC_TO_MINUTE => + millis - millis % MILLIS_PER_MINUTE + case _ => // Try to truncate date levels + val dDays = millisToDays(millis, zoneId) + daysToMillis(truncDate(dDays, level), zoneId) + } + truncated * MICROS_PER_MILLIS } - truncated * MICROS_PER_MILLIS } /** diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/util/DateTimeUtilsSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/util/DateTimeUtilsSuite.scala index cd0594c775a47..ff4d8a2457922 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/util/DateTimeUtilsSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/util/DateTimeUtilsSuite.scala @@ -499,9 +499,9 @@ class DateTimeUtilsSuite extends SparkFunSuite with Matchers with SQLHelper { level: Int, expected: String, inputTS: SQLTimestamp, - timezone: TimeZone = DateTimeUtils.defaultTimeZone()): Unit = { + zoneId: ZoneId = defaultZoneId): Unit = { val truncated = - DateTimeUtils.truncTimestamp(inputTS, level, timezone) + DateTimeUtils.truncTimestamp(inputTS, level, zoneId) val expectedTS = toTimestamp(expected, defaultZoneId) assert(truncated === expectedTS.get) } @@ -539,6 +539,7 @@ class DateTimeUtilsSuite extends SparkFunSuite with Matchers with SQLHelper { for (tz <- ALL_TIMEZONES) { withDefaultTimeZone(tz) { + val zid = tz.toZoneId val inputTS = DateTimeUtils.stringToTimestamp( UTF8String.fromString("2015-03-05T09:32:05.359"), defaultZoneId) val inputTS1 = DateTimeUtils.stringToTimestamp( @@ -552,23 +553,23 @@ class DateTimeUtilsSuite extends SparkFunSuite with Matchers with SQLHelper { val inputTS5 = DateTimeUtils.stringToTimestamp( UTF8String.fromString("1999-03-29T01:02:03.456789"), defaultZoneId) - testTrunc(DateTimeUtils.TRUNC_TO_YEAR, "2015-01-01T00:00:00", inputTS.get, tz) - testTrunc(DateTimeUtils.TRUNC_TO_MONTH, "2015-03-01T00:00:00", inputTS.get, tz) - testTrunc(DateTimeUtils.TRUNC_TO_DAY, "2015-03-05T00:00:00", inputTS.get, tz) - testTrunc(DateTimeUtils.TRUNC_TO_HOUR, "2015-03-05T09:00:00", inputTS.get, tz) - testTrunc(DateTimeUtils.TRUNC_TO_MINUTE, "2015-03-05T09:32:00", inputTS.get, tz) - testTrunc(DateTimeUtils.TRUNC_TO_SECOND, "2015-03-05T09:32:05", inputTS.get, tz) - testTrunc(DateTimeUtils.TRUNC_TO_WEEK, "2015-03-02T00:00:00", inputTS.get, tz) - 
testTrunc(DateTimeUtils.TRUNC_TO_WEEK, "2015-03-30T00:00:00", inputTS1.get, tz) - testTrunc(DateTimeUtils.TRUNC_TO_WEEK, "2015-03-30T00:00:00", inputTS2.get, tz) - testTrunc(DateTimeUtils.TRUNC_TO_WEEK, "2015-03-30T00:00:00", inputTS3.get, tz) - testTrunc(DateTimeUtils.TRUNC_TO_WEEK, "2015-03-23T00:00:00", inputTS4.get, tz) - testTrunc(DateTimeUtils.TRUNC_TO_QUARTER, "2015-01-01T00:00:00", inputTS.get, tz) - testTrunc(DateTimeUtils.TRUNC_TO_QUARTER, "2015-01-01T00:00:00", inputTS1.get, tz) - testTrunc(DateTimeUtils.TRUNC_TO_QUARTER, "2015-04-01T00:00:00", inputTS2.get, tz) - testTrunc(DateTimeUtils.TRUNC_TO_DECADE, "1990-01-01", inputTS5.get, tz) - testTrunc(DateTimeUtils.TRUNC_TO_CENTURY, "1901-01-01", inputTS5.get, tz) - testTrunc(DateTimeUtils.TRUNC_TO_MILLENNIUM, "2001-01-01", inputTS.get, tz) + testTrunc(DateTimeUtils.TRUNC_TO_YEAR, "2015-01-01T00:00:00", inputTS.get, zid) + testTrunc(DateTimeUtils.TRUNC_TO_MONTH, "2015-03-01T00:00:00", inputTS.get, zid) + testTrunc(DateTimeUtils.TRUNC_TO_DAY, "2015-03-05T00:00:00", inputTS.get, zid) + testTrunc(DateTimeUtils.TRUNC_TO_HOUR, "2015-03-05T09:00:00", inputTS.get, zid) + testTrunc(DateTimeUtils.TRUNC_TO_MINUTE, "2015-03-05T09:32:00", inputTS.get, zid) + testTrunc(DateTimeUtils.TRUNC_TO_SECOND, "2015-03-05T09:32:05", inputTS.get, zid) + testTrunc(DateTimeUtils.TRUNC_TO_WEEK, "2015-03-02T00:00:00", inputTS.get, zid) + testTrunc(DateTimeUtils.TRUNC_TO_WEEK, "2015-03-30T00:00:00", inputTS1.get, zid) + testTrunc(DateTimeUtils.TRUNC_TO_WEEK, "2015-03-30T00:00:00", inputTS2.get, zid) + testTrunc(DateTimeUtils.TRUNC_TO_WEEK, "2015-03-30T00:00:00", inputTS3.get, zid) + testTrunc(DateTimeUtils.TRUNC_TO_WEEK, "2015-03-23T00:00:00", inputTS4.get, zid) + testTrunc(DateTimeUtils.TRUNC_TO_QUARTER, "2015-01-01T00:00:00", inputTS.get, zid) + testTrunc(DateTimeUtils.TRUNC_TO_QUARTER, "2015-01-01T00:00:00", inputTS1.get, zid) + testTrunc(DateTimeUtils.TRUNC_TO_QUARTER, "2015-04-01T00:00:00", inputTS2.get, zid) + testTrunc(DateTimeUtils.TRUNC_TO_DECADE, "1990-01-01", inputTS5.get, zid) + testTrunc(DateTimeUtils.TRUNC_TO_CENTURY, "1901-01-01", inputTS5.get, zid) + testTrunc(DateTimeUtils.TRUNC_TO_MILLENNIUM, "2001-01-01", inputTS.get, zid) } } } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/DateFunctionsSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/DateFunctionsSuite.scala index 41d53c959ef99..ba45b9f9b62df 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/DateFunctionsSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/DateFunctionsSuite.scala @@ -856,4 +856,17 @@ class DateFunctionsSuite extends QueryTest with SharedSparkSession { TimeZone.setDefault(defaultTz) } } + + test("SPARK-30766: date_trunc of old timestamps to hours and days") { + def checkTrunc(level: String, expected: String): Unit = { + val df = Seq("0010-01-01 01:02:03.123456") + .toDF() + .select($"value".cast("timestamp").as("ts")) + .select(date_trunc(level, $"ts").cast("string")) + checkAnswer(df, Row(expected)) + } + + checkTrunc("HOUR", "0010-01-01 01:00:00") + checkTrunc("DAY", "0010-01-01 00:00:00") + } } From b343757b1bd5d0344b82f36aa4d65ed34f840606 Mon Sep 17 00:00:00 2001 From: HyukjinKwon Date: Fri, 14 Feb 2020 10:18:08 -0800 Subject: [PATCH 049/185] [SPARK-29748][DOCS][FOLLOW-UP] Add a note that the legacy environment variable to set in both executor and driver ### What changes were proposed in this pull request? 
This PR address the comment at https://github.com/apache/spark/pull/26496#discussion_r379194091 and improves the migration guide to explicitly note that the legacy environment variable to set in both executor and driver. ### Why are the changes needed? To clarify this env should be set both in driver and executors. ### Does this PR introduce any user-facing change? Nope. ### How was this patch tested? I checked it via md editor. Closes #27573 from HyukjinKwon/SPARK-29748. Authored-by: HyukjinKwon Signed-off-by: Shixiong Zhu --- docs/pyspark-migration-guide.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/pyspark-migration-guide.md b/docs/pyspark-migration-guide.md index 8ea4fec75edf8..f7f20389aa694 100644 --- a/docs/pyspark-migration-guide.md +++ b/docs/pyspark-migration-guide.md @@ -87,7 +87,7 @@ Please refer [Migration Guide: SQL, Datasets and DataFrame](sql-migration-guide. - Since Spark 3.0, `Column.getItem` is fixed such that it does not call `Column.apply`. Consequently, if `Column` is used as an argument to `getItem`, the indexing operator should be used. For example, `map_col.getItem(col('id'))` should be replaced with `map_col[col('id')]`. - - As of Spark 3.0 `Row` field names are no longer sorted alphabetically when constructing with named arguments for Python versions 3.6 and above, and the order of fields will match that as entered. To enable sorted fields by default, as in Spark 2.4, set the environment variable `PYSPARK_ROW_FIELD_SORTING_ENABLED` to "true". For Python versions less than 3.6, the field names will be sorted alphabetically as the only option. + - As of Spark 3.0 `Row` field names are no longer sorted alphabetically when constructing with named arguments for Python versions 3.6 and above, and the order of fields will match that as entered. To enable sorted fields by default, as in Spark 2.4, set the environment variable `PYSPARK_ROW_FIELD_SORTING_ENABLED` to "true" for both executors and driver - this environment variable must be consistent on all executors and driver; otherwise, it may cause failures or incorrect answers. For Python versions less than 3.6, the field names will be sorted alphabetically as the only option. ## Upgrading from PySpark 2.3 to 2.4 From d273a2bb0fac452a97f5670edd69d3e452e3e57e Mon Sep 17 00:00:00 2001 From: Holden Karau Date: Fri, 14 Feb 2020 12:36:52 -0800 Subject: [PATCH 050/185] [SPARK-20628][CORE][K8S] Start to improve Spark decommissioning & preemption support This PR is based on an existing/previou PR - https://github.com/apache/spark/pull/19045 ### What changes were proposed in this pull request? This changes adds a decommissioning state that we can enter when the cloud provider/scheduler lets us know we aren't going to be removed immediately but instead will be removed soon. This concept fits nicely in K8s and also with spot-instances on AWS / preemptible instances all of which we can get a notice that our host is going away. For now we simply stop scheduling jobs, in the future we could perform some kind of migration of data during scale-down, or at least stop accepting new blocks to cache. There is a design document at https://docs.google.com/document/d/1xVO1b6KAwdUhjEJBolVPl9C6sLj7oOveErwDSYdT-pE/edit?usp=sharing ### Why are the changes needed? With more move to preemptible multi-tenancy, serverless environments, and spot-instances better handling of node scale down is required. ### Does this PR introduce any user-facing change? 
There is no API change, however an additional configuration flag is added to enable/disable this behaviour. ### How was this patch tested? New integration tests in the Spark K8s integration testing. Extension of the AppClientSuite to test decommissioning seperate from the K8s. Closes #26440 from holdenk/SPARK-20628-keep-track-of-nodes-which-are-going-to-be-shutdown-r4. Lead-authored-by: Holden Karau Co-authored-by: Holden Karau Signed-off-by: Holden Karau --- .../apache/spark/deploy/DeployMessage.scala | 11 ++ .../apache/spark/deploy/ExecutorState.scala | 8 +- .../deploy/client/StandaloneAppClient.scala | 2 + .../client/StandaloneAppClientListener.scala | 2 + .../apache/spark/deploy/master/Master.scala | 31 +++++ .../apache/spark/deploy/worker/Worker.scala | 26 ++++ .../CoarseGrainedExecutorBackend.scala | 39 +++++- .../org/apache/spark/executor/Executor.scala | 16 +++ .../apache/spark/internal/config/Worker.scala | 5 + .../main/scala/org/apache/spark/rdd/RDD.scala | 2 + .../spark/scheduler/ExecutorLossReason.scala | 8 ++ .../org/apache/spark/scheduler/Pool.scala | 4 + .../apache/spark/scheduler/Schedulable.scala | 1 + .../spark/scheduler/SchedulerBackend.scala | 3 + .../spark/scheduler/TaskScheduler.scala | 5 + .../spark/scheduler/TaskSchedulerImpl.scala | 5 + .../spark/scheduler/TaskSetManager.scala | 6 + .../cluster/CoarseGrainedClusterMessage.scala | 2 + .../CoarseGrainedSchedulerBackend.scala | 66 +++++++++- .../cluster/StandaloneSchedulerBackend.scala | 6 + .../org/apache/spark/util/SignalUtils.scala | 2 +- .../spark/deploy/client/AppClientSuite.scala | 39 +++++- .../spark/scheduler/DAGSchedulerSuite.scala | 2 + .../ExternalClusterManagerSuite.scala | 1 + .../scheduler/WorkerDecommissionSuite.scala | 84 +++++++++++++ .../spark/deploy/k8s/KubernetesConf.scala | 3 + .../features/BasicExecutorFeatureStep.scala | 20 ++- .../src/main/dockerfiles/spark/Dockerfile | 4 +- .../src/main/dockerfiles/spark/decom.sh | 35 ++++++ .../src/main/dockerfiles/spark/entrypoint.sh | 6 +- .../dev/dev-run-integration-tests.sh | 9 +- .../integrationtest/DecommissionSuite.scala | 49 ++++++++ .../k8s/integrationtest/KubernetesSuite.scala | 117 ++++++++++++++---- .../tests/decommissioning.py | 45 +++++++ sbin/decommission-slave.sh | 57 +++++++++ sbin/spark-daemon.sh | 15 +++ 36 files changed, 690 insertions(+), 46 deletions(-) create mode 100644 core/src/test/scala/org/apache/spark/scheduler/WorkerDecommissionSuite.scala create mode 100755 resource-managers/kubernetes/docker/src/main/dockerfiles/spark/decom.sh create mode 100644 resource-managers/kubernetes/integration-tests/src/test/scala/org/apache/spark/deploy/k8s/integrationtest/DecommissionSuite.scala create mode 100644 resource-managers/kubernetes/integration-tests/tests/decommissioning.py create mode 100644 sbin/decommission-slave.sh diff --git a/core/src/main/scala/org/apache/spark/deploy/DeployMessage.scala b/core/src/main/scala/org/apache/spark/deploy/DeployMessage.scala index fba371dcfb761..18305ad3746a6 100644 --- a/core/src/main/scala/org/apache/spark/deploy/DeployMessage.scala +++ b/core/src/main/scala/org/apache/spark/deploy/DeployMessage.scala @@ -60,6 +60,15 @@ private[deploy] object DeployMessages { assert (port > 0) } + /** + * @param id the worker id + * @param worker the worker endpoint ref + */ + case class WorkerDecommission( + id: String, + worker: RpcEndpointRef) + extends DeployMessage + case class ExecutorStateChanged( appId: String, execId: Int, @@ -149,6 +158,8 @@ private[deploy] object DeployMessages { case object 
ReregisterWithMaster // used when a worker attempts to reconnect to a master + case object DecommissionSelf // Mark as decommissioned. May be Master to Worker in the future. + // AppClient to Master case class RegisterApplication(appDescription: ApplicationDescription, driver: RpcEndpointRef) diff --git a/core/src/main/scala/org/apache/spark/deploy/ExecutorState.scala b/core/src/main/scala/org/apache/spark/deploy/ExecutorState.scala index 69c98e28931d7..0751bcf221f86 100644 --- a/core/src/main/scala/org/apache/spark/deploy/ExecutorState.scala +++ b/core/src/main/scala/org/apache/spark/deploy/ExecutorState.scala @@ -19,9 +19,13 @@ package org.apache.spark.deploy private[deploy] object ExecutorState extends Enumeration { - val LAUNCHING, RUNNING, KILLED, FAILED, LOST, EXITED = Value + val LAUNCHING, RUNNING, KILLED, FAILED, LOST, EXITED, DECOMMISSIONED = Value type ExecutorState = Value - def isFinished(state: ExecutorState): Boolean = Seq(KILLED, FAILED, LOST, EXITED).contains(state) + // DECOMMISSIONED isn't listed as finished since we don't want to remove the executor from + // the worker and the executor still exists - but we do want to avoid scheduling new tasks on it. + private val finishedStates = Seq(KILLED, FAILED, LOST, EXITED) + + def isFinished(state: ExecutorState): Boolean = finishedStates.contains(state) } diff --git a/core/src/main/scala/org/apache/spark/deploy/client/StandaloneAppClient.scala b/core/src/main/scala/org/apache/spark/deploy/client/StandaloneAppClient.scala index 8f17159228f8b..eedf5e969e291 100644 --- a/core/src/main/scala/org/apache/spark/deploy/client/StandaloneAppClient.scala +++ b/core/src/main/scala/org/apache/spark/deploy/client/StandaloneAppClient.scala @@ -180,6 +180,8 @@ private[spark] class StandaloneAppClient( logInfo("Executor updated: %s is now %s%s".format(fullId, state, messageText)) if (ExecutorState.isFinished(state)) { listener.executorRemoved(fullId, message.getOrElse(""), exitStatus, workerLost) + } else if (state == ExecutorState.DECOMMISSIONED) { + listener.executorDecommissioned(fullId, message.getOrElse("")) } case WorkerRemoved(id, host, message) => diff --git a/core/src/main/scala/org/apache/spark/deploy/client/StandaloneAppClientListener.scala b/core/src/main/scala/org/apache/spark/deploy/client/StandaloneAppClientListener.scala index d8bc1a883def1..2e38a6847891d 100644 --- a/core/src/main/scala/org/apache/spark/deploy/client/StandaloneAppClientListener.scala +++ b/core/src/main/scala/org/apache/spark/deploy/client/StandaloneAppClientListener.scala @@ -39,5 +39,7 @@ private[spark] trait StandaloneAppClientListener { def executorRemoved( fullId: String, message: String, exitStatus: Option[Int], workerLost: Boolean): Unit + def executorDecommissioned(fullId: String, message: String): Unit + def workerRemoved(workerId: String, host: String, message: String): Unit } diff --git a/core/src/main/scala/org/apache/spark/deploy/master/Master.scala b/core/src/main/scala/org/apache/spark/deploy/master/Master.scala index 8d3795cae707a..71df5dfa423a9 100644 --- a/core/src/main/scala/org/apache/spark/deploy/master/Master.scala +++ b/core/src/main/scala/org/apache/spark/deploy/master/Master.scala @@ -243,6 +243,15 @@ private[deploy] class Master( logError("Leadership has been revoked -- master shutting down.") System.exit(0) + case WorkerDecommission(id, workerRef) => + logInfo("Recording worker %s decommissioning".format(id)) + if (state == RecoveryState.STANDBY) { + workerRef.send(MasterInStandby) + } else { + // We use foreach since get gives us 
an option and we can skip the failures. + idToWorker.get(id).foreach(decommissionWorker) + } + case RegisterWorker( id, workerHost, workerPort, workerRef, cores, memory, workerWebUiUrl, masterAddress, resources) => @@ -313,7 +322,9 @@ private[deploy] class Master( // Only retry certain number of times so we don't go into an infinite loop. // Important note: this code path is not exercised by tests, so be very careful when // changing this `if` condition. + // We also don't count failures from decommissioned workers since they are "expected." if (!normalExit + && oldState != ExecutorState.DECOMMISSIONED && appInfo.incrementRetryCount() >= maxExecutorRetries && maxExecutorRetries >= 0) { // < 0 disables this application-killing path val execs = appInfo.executors.values @@ -850,6 +861,26 @@ private[deploy] class Master( true } + private def decommissionWorker(worker: WorkerInfo): Unit = { + if (worker.state != WorkerState.DECOMMISSIONED) { + logInfo("Decommissioning worker %s on %s:%d".format(worker.id, worker.host, worker.port)) + worker.setState(WorkerState.DECOMMISSIONED) + for (exec <- worker.executors.values) { + logInfo("Telling app of decommission executors") + exec.application.driver.send(ExecutorUpdated( + exec.id, ExecutorState.DECOMMISSIONED, + Some("worker decommissioned"), None, workerLost = false)) + exec.state = ExecutorState.DECOMMISSIONED + exec.application.removeExecutor(exec) + } + // On recovery do not add a decommissioned executor + persistenceEngine.removeWorker(worker) + } else { + logWarning("Skipping decommissioning worker %s on %s:%d as worker is already decommissioned". + format(worker.id, worker.host, worker.port)) + } + } + private def removeWorker(worker: WorkerInfo, msg: String): Unit = { logInfo("Removing worker " + worker.id + " on " + worker.host + ":" + worker.port) worker.setState(WorkerState.DEAD) diff --git a/core/src/main/scala/org/apache/spark/deploy/worker/Worker.scala b/core/src/main/scala/org/apache/spark/deploy/worker/Worker.scala index 4be495ac4f13f..d988bcedb47f0 100755 --- a/core/src/main/scala/org/apache/spark/deploy/worker/Worker.scala +++ b/core/src/main/scala/org/apache/spark/deploy/worker/Worker.scala @@ -67,6 +67,14 @@ private[deploy] class Worker( Utils.checkHost(host) assert (port > 0) + // If worker decommissioning is enabled register a handler on PWR to shutdown. + if (conf.get(WORKER_DECOMMISSION_ENABLED)) { + logInfo("Registering SIGPWR handler to trigger decommissioning.") + SignalUtils.register("PWR")(decommissionSelf) + } else { + logInfo("Worker decommissioning not enabled, SIGPWR will result in exiting.") + } + // A scheduled executor used to send messages at the specified time. private val forwardMessageScheduler = ThreadUtils.newDaemonSingleThreadScheduledExecutor("worker-forward-message-scheduler") @@ -128,6 +136,7 @@ private[deploy] class Worker( private val workerUri = RpcEndpointAddress(rpcEnv.address, endpointName).toString private var registered = false private var connected = false + private var decommissioned = false private val workerId = generateWorkerId() private val sparkHome = if (sys.props.contains(IS_TESTING.key)) { @@ -549,6 +558,8 @@ private[deploy] class Worker( case LaunchExecutor(masterUrl, appId, execId, appDesc, cores_, memory_, resources_) => if (masterUrl != activeMasterUrl) { logWarning("Invalid Master (" + masterUrl + ") attempted to launch executor.") + } else if (decommissioned) { + logWarning("Asked to launch an executor while decommissioned. 
Not launching executor.") } else { try { logInfo("Asked to launch executor %s/%d for %s".format(appId, execId, appDesc.name)) @@ -672,6 +683,9 @@ private[deploy] class Worker( case ApplicationFinished(id) => finishedApps += id maybeCleanupApplication(id) + + case DecommissionSelf => + decommissionSelf() } override def receiveAndReply(context: RpcCallContext): PartialFunction[Any, Unit] = { @@ -771,6 +785,18 @@ private[deploy] class Worker( } } + private[deploy] def decommissionSelf(): Boolean = { + if (conf.get(WORKER_DECOMMISSION_ENABLED)) { + logDebug("Decommissioning self") + decommissioned = true + sendToMaster(WorkerDecommission(workerId, self)) + } else { + logWarning("Asked to decommission self, but decommissioning not enabled") + } + // Return true since can be called as a signal handler + true + } + private[worker] def handleDriverStateChanged(driverStateChanged: DriverStateChanged): Unit = { val driverId = driverStateChanged.driverId val exception = driverStateChanged.exception diff --git a/core/src/main/scala/org/apache/spark/executor/CoarseGrainedExecutorBackend.scala b/core/src/main/scala/org/apache/spark/executor/CoarseGrainedExecutorBackend.scala index 25c5b9812fa1a..faf03a64ae8b2 100644 --- a/core/src/main/scala/org/apache/spark/executor/CoarseGrainedExecutorBackend.scala +++ b/core/src/main/scala/org/apache/spark/executor/CoarseGrainedExecutorBackend.scala @@ -43,7 +43,7 @@ import org.apache.spark.rpc._ import org.apache.spark.scheduler.{ExecutorLossReason, TaskDescription} import org.apache.spark.scheduler.cluster.CoarseGrainedClusterMessages._ import org.apache.spark.serializer.SerializerInstance -import org.apache.spark.util.{ChildFirstURLClassLoader, MutableURLClassLoader, ThreadUtils, Utils} +import org.apache.spark.util.{ChildFirstURLClassLoader, MutableURLClassLoader, SignalUtils, ThreadUtils, Utils} private[spark] class CoarseGrainedExecutorBackend( override val rpcEnv: RpcEnv, @@ -64,6 +64,7 @@ private[spark] class CoarseGrainedExecutorBackend( private[this] val stopping = new AtomicBoolean(false) var executor: Executor = null + @volatile private var decommissioned = false @volatile var driver: Option[RpcEndpointRef] = None // If this CoarseGrainedExecutorBackend is changed to support multiple threads, then this may need @@ -80,6 +81,9 @@ private[spark] class CoarseGrainedExecutorBackend( private[executor] val taskResources = new mutable.HashMap[Long, Map[String, ResourceInformation]] override def onStart(): Unit = { + logInfo("Registering PWR handler.") + SignalUtils.register("PWR")(decommissionSelf) + logInfo("Connecting to driver: " + driverUrl) try { _resources = parseOrFindResources(resourcesFileOpt) @@ -160,6 +164,16 @@ private[spark] class CoarseGrainedExecutorBackend( if (executor == null) { exitExecutor(1, "Received LaunchTask command but executor was null") } else { + if (decommissioned) { + logError("Asked to launch a task while decommissioned.") + driver match { + case Some(endpoint) => + logInfo("Sending DecommissionExecutor to driver.") + endpoint.send(DecommissionExecutor(executorId)) + case _ => + logError("No registered driver to send Decommission to.") + } + } val taskDesc = TaskDescription.decode(data.value) logInfo("Got assigned task " + taskDesc.taskId) taskResources(taskDesc.taskId) = taskDesc.resources @@ -242,6 +256,29 @@ private[spark] class CoarseGrainedExecutorBackend( System.exit(code) } + + private def decommissionSelf(): Boolean = { + logInfo("Decommissioning self w/sync") + try { + decommissioned = true + // Tell master we are are 
decommissioned so it stops trying to schedule us + if (driver.nonEmpty) { + driver.get.askSync[Boolean](DecommissionExecutor(executorId)) + } else { + logError("No driver to message decommissioning.") + } + if (executor != null) { + executor.decommission() + } + logInfo("Done decommissioning self.") + // Return true since we are handling a signal + true + } catch { + case e: Exception => + logError(s"Error ${e} during attempt to decommission self") + false + } + } } private[spark] object CoarseGrainedExecutorBackend extends Logging { diff --git a/core/src/main/scala/org/apache/spark/executor/Executor.scala b/core/src/main/scala/org/apache/spark/executor/Executor.scala index 8aeb16fe5d8c8..2bfa1cea4b26f 100644 --- a/core/src/main/scala/org/apache/spark/executor/Executor.scala +++ b/core/src/main/scala/org/apache/spark/executor/Executor.scala @@ -216,16 +216,32 @@ private[spark] class Executor( */ private var heartbeatFailures = 0 + /** + * Flag to prevent launching new tasks while decommissioned. There could be a race condition + * accessing this, but decommissioning is only intended to help not be a hard stop. + */ + private var decommissioned = false + heartbeater.start() metricsPoller.start() private[executor] def numRunningTasks: Int = runningTasks.size() + /** + * Mark an executor for decommissioning and avoid launching new tasks. + */ + private[spark] def decommission(): Unit = { + decommissioned = true + } + def launchTask(context: ExecutorBackend, taskDescription: TaskDescription): Unit = { val tr = new TaskRunner(context, taskDescription) runningTasks.put(taskDescription.taskId, tr) threadPool.execute(tr) + if (decommissioned) { + log.error(s"Launching a task while in decommissioned state.") + } } def killTask(taskId: Long, interruptThread: Boolean, reason: String): Unit = { diff --git a/core/src/main/scala/org/apache/spark/internal/config/Worker.scala b/core/src/main/scala/org/apache/spark/internal/config/Worker.scala index f1eaae29f18df..2b175c1e14ee5 100644 --- a/core/src/main/scala/org/apache/spark/internal/config/Worker.scala +++ b/core/src/main/scala/org/apache/spark/internal/config/Worker.scala @@ -71,4 +71,9 @@ private[spark] object Worker { ConfigBuilder("spark.worker.ui.compressedLogFileLengthCacheSize") .intConf .createWithDefault(100) + + private[spark] val WORKER_DECOMMISSION_ENABLED = + ConfigBuilder("spark.worker.decommission.enabled") + .booleanConf + .createWithDefault(false) } diff --git a/core/src/main/scala/org/apache/spark/rdd/RDD.scala b/core/src/main/scala/org/apache/spark/rdd/RDD.scala index 64d2032a12721..a26b5791fa08b 100644 --- a/core/src/main/scala/org/apache/spark/rdd/RDD.scala +++ b/core/src/main/scala/org/apache/spark/rdd/RDD.scala @@ -361,6 +361,7 @@ abstract class RDD[T: ClassTag]( readCachedBlock = false computeOrReadCheckpoint(partition, context) }) match { + // Block hit. case Left(blockResult) => if (readCachedBlock) { val existingMetrics = context.taskMetrics().inputMetrics @@ -374,6 +375,7 @@ abstract class RDD[T: ClassTag]( } else { new InterruptibleIterator(context, blockResult.data.asInstanceOf[Iterator[T]]) } + // Need to compute the block. 
case Right(iter) => new InterruptibleIterator(context, iter.asInstanceOf[Iterator[T]]) } diff --git a/core/src/main/scala/org/apache/spark/scheduler/ExecutorLossReason.scala b/core/src/main/scala/org/apache/spark/scheduler/ExecutorLossReason.scala index 46a35b6a2eaf9..ee31093ec0652 100644 --- a/core/src/main/scala/org/apache/spark/scheduler/ExecutorLossReason.scala +++ b/core/src/main/scala/org/apache/spark/scheduler/ExecutorLossReason.scala @@ -58,3 +58,11 @@ private [spark] object LossReasonPending extends ExecutorLossReason("Pending los private[spark] case class SlaveLost(_message: String = "Slave lost", workerLost: Boolean = false) extends ExecutorLossReason(_message) + +/** + * A loss reason that means the executor is marked for decommissioning. + * + * This is used by the task scheduler to remove state associated with the executor, but + * not yet fail any tasks that were running in the executor before the executor is "fully" lost. + */ +private [spark] object ExecutorDecommission extends ExecutorLossReason("Executor decommission.") diff --git a/core/src/main/scala/org/apache/spark/scheduler/Pool.scala b/core/src/main/scala/org/apache/spark/scheduler/Pool.scala index 80805df256a15..2e2851eb9070b 100644 --- a/core/src/main/scala/org/apache/spark/scheduler/Pool.scala +++ b/core/src/main/scala/org/apache/spark/scheduler/Pool.scala @@ -88,6 +88,10 @@ private[spark] class Pool( schedulableQueue.asScala.foreach(_.executorLost(executorId, host, reason)) } + override def executorDecommission(executorId: String): Unit = { + schedulableQueue.asScala.foreach(_.executorDecommission(executorId)) + } + override def checkSpeculatableTasks(minTimeToSpeculation: Int): Boolean = { var shouldRevive = false for (schedulable <- schedulableQueue.asScala) { diff --git a/core/src/main/scala/org/apache/spark/scheduler/Schedulable.scala b/core/src/main/scala/org/apache/spark/scheduler/Schedulable.scala index b6f88ed0a93aa..8cc239c81d11a 100644 --- a/core/src/main/scala/org/apache/spark/scheduler/Schedulable.scala +++ b/core/src/main/scala/org/apache/spark/scheduler/Schedulable.scala @@ -43,6 +43,7 @@ private[spark] trait Schedulable { def removeSchedulable(schedulable: Schedulable): Unit def getSchedulableByName(name: String): Schedulable def executorLost(executorId: String, host: String, reason: ExecutorLossReason): Unit + def executorDecommission(executorId: String): Unit def checkSpeculatableTasks(minTimeToSpeculation: Int): Boolean def getSortedTaskSetQueue: ArrayBuffer[TaskSetManager] } diff --git a/core/src/main/scala/org/apache/spark/scheduler/SchedulerBackend.scala b/core/src/main/scala/org/apache/spark/scheduler/SchedulerBackend.scala index 9159d2a0158d5..4752353046c19 100644 --- a/core/src/main/scala/org/apache/spark/scheduler/SchedulerBackend.scala +++ b/core/src/main/scala/org/apache/spark/scheduler/SchedulerBackend.scala @@ -27,6 +27,9 @@ private[spark] trait SchedulerBackend { def start(): Unit def stop(): Unit + /** + * Update the current offers and schedule tasks + */ def reviveOffers(): Unit def defaultParallelism(): Int diff --git a/core/src/main/scala/org/apache/spark/scheduler/TaskScheduler.scala b/core/src/main/scala/org/apache/spark/scheduler/TaskScheduler.scala index 15f5d20e9be75..e9e638a3645ac 100644 --- a/core/src/main/scala/org/apache/spark/scheduler/TaskScheduler.scala +++ b/core/src/main/scala/org/apache/spark/scheduler/TaskScheduler.scala @@ -98,6 +98,11 @@ private[spark] trait TaskScheduler { */ def applicationId(): String = appId + /** + * Process a decommissioning executor. 
+ */ + def executorDecommission(executorId: String): Unit + /** * Process a lost executor */ diff --git a/core/src/main/scala/org/apache/spark/scheduler/TaskSchedulerImpl.scala b/core/src/main/scala/org/apache/spark/scheduler/TaskSchedulerImpl.scala index bf92081d13907..1b197c4cca53e 100644 --- a/core/src/main/scala/org/apache/spark/scheduler/TaskSchedulerImpl.scala +++ b/core/src/main/scala/org/apache/spark/scheduler/TaskSchedulerImpl.scala @@ -734,6 +734,11 @@ private[spark] class TaskSchedulerImpl( } } + override def executorDecommission(executorId: String): Unit = { + rootPool.executorDecommission(executorId) + backend.reviveOffers() + } + override def executorLost(executorId: String, reason: ExecutorLossReason): Unit = { var failedExecutor: Option[String] = None diff --git a/core/src/main/scala/org/apache/spark/scheduler/TaskSetManager.scala b/core/src/main/scala/org/apache/spark/scheduler/TaskSetManager.scala index 2ce11347ade39..18684ee8ebbc2 100644 --- a/core/src/main/scala/org/apache/spark/scheduler/TaskSetManager.scala +++ b/core/src/main/scala/org/apache/spark/scheduler/TaskSetManager.scala @@ -1083,6 +1083,12 @@ private[spark] class TaskSetManager( levels.toArray } + def executorDecommission(execId: String): Unit = { + recomputeLocality() + // Future consideration: if an executor is decommissioned it may make sense to add the current + // tasks to the spec exec queue. + } + def recomputeLocality(): Unit = { // A zombie TaskSetManager may reach here while executorLost happens if (isZombie) return diff --git a/core/src/main/scala/org/apache/spark/scheduler/cluster/CoarseGrainedClusterMessage.scala b/core/src/main/scala/org/apache/spark/scheduler/cluster/CoarseGrainedClusterMessage.scala index 283390814a6c0..8db0122f17ab4 100644 --- a/core/src/main/scala/org/apache/spark/scheduler/cluster/CoarseGrainedClusterMessage.scala +++ b/core/src/main/scala/org/apache/spark/scheduler/cluster/CoarseGrainedClusterMessage.scala @@ -94,6 +94,8 @@ private[spark] object CoarseGrainedClusterMessages { case class RemoveExecutor(executorId: String, reason: ExecutorLossReason) extends CoarseGrainedClusterMessage + case class DecommissionExecutor(executorId: String) extends CoarseGrainedClusterMessage + case class RemoveWorker(workerId: String, host: String, message: String) extends CoarseGrainedClusterMessage diff --git a/core/src/main/scala/org/apache/spark/scheduler/cluster/CoarseGrainedSchedulerBackend.scala b/core/src/main/scala/org/apache/spark/scheduler/cluster/CoarseGrainedSchedulerBackend.scala index 63aa04986b073..6e1efdaf5beb2 100644 --- a/core/src/main/scala/org/apache/spark/scheduler/cluster/CoarseGrainedSchedulerBackend.scala +++ b/core/src/main/scala/org/apache/spark/scheduler/cluster/CoarseGrainedSchedulerBackend.scala @@ -92,6 +92,9 @@ class CoarseGrainedSchedulerBackend(scheduler: TaskSchedulerImpl, val rpcEnv: Rp // Executors that have been lost, but for which we don't yet know the real exit reason. 
private val executorsPendingLossReason = new HashSet[String] + // Executors which are being decommissioned + protected val executorsPendingDecommission = new HashSet[String] + // A map of ResourceProfile id to map of hostname with its possible task number running on it @GuardedBy("CoarseGrainedSchedulerBackend.this") protected var rpHostToLocalTaskCount: Map[Int, Map[String, Int]] = Map.empty @@ -185,11 +188,20 @@ class CoarseGrainedSchedulerBackend(scheduler: TaskSchedulerImpl, val rpcEnv: Rp executorDataMap.get(executorId).foreach(_.executorEndpoint.send(StopExecutor)) removeExecutor(executorId, reason) + case DecommissionExecutor(executorId) => + logError(s"Received decommission executor message ${executorId}.") + decommissionExecutor(executorId) + + case RemoveWorker(workerId, host, message) => + removeWorker(workerId, host, message) + case LaunchedExecutor(executorId) => executorDataMap.get(executorId).foreach { data => data.freeCores = data.totalCores } makeOffers(executorId) + case e => + logError(s"Received unexpected message. ${e}") } override def receiveAndReply(context: RpcCallContext): PartialFunction[Any, Unit] = { @@ -257,6 +269,11 @@ class CoarseGrainedSchedulerBackend(scheduler: TaskSchedulerImpl, val rpcEnv: Rp removeWorker(workerId, host, message) context.reply(true) + case DecommissionExecutor(executorId) => + logError(s"Received decommission executor message ${executorId}.") + decommissionExecutor(executorId) + context.reply(true) + case RetrieveSparkAppConfig(resourceProfileId) => val rp = scheduler.sc.resourceProfileManager.resourceProfileFromId(resourceProfileId) val reply = SparkAppConfig( @@ -265,6 +282,8 @@ class CoarseGrainedSchedulerBackend(scheduler: TaskSchedulerImpl, val rpcEnv: Rp Option(delegationTokens.get()), rp) context.reply(reply) + case e => + logError(s"Received unexpected ask ${e}") } // Make fake resource offers on all executors @@ -365,6 +384,7 @@ class CoarseGrainedSchedulerBackend(scheduler: TaskSchedulerImpl, val rpcEnv: Rp addressToExecutorId -= executorInfo.executorAddress executorDataMap -= executorId executorsPendingLossReason -= executorId + executorsPendingDecommission -= executorId executorsPendingToRemove.remove(executorId).getOrElse(false) } totalCoreCount.addAndGet(-executorInfo.totalCores) @@ -389,6 +409,35 @@ class CoarseGrainedSchedulerBackend(scheduler: TaskSchedulerImpl, val rpcEnv: Rp scheduler.workerRemoved(workerId, host, message) } + /** + * Mark a given executor as decommissioned and stop making resource offers for it. + */ + private def decommissionExecutor(executorId: String): Boolean = { + val shouldDisable = CoarseGrainedSchedulerBackend.this.synchronized { + // Only bother decommissioning executors which are alive. + if (isExecutorActive(executorId)) { + executorsPendingDecommission += executorId + true + } else { + false + } + } + + if (shouldDisable) { + logInfo(s"Starting decommissioning executor $executorId.") + try { + scheduler.executorDecommission(executorId) + } catch { + case e: Exception => + logError(s"Unexpected error during decommissioning ${e.toString}", e) + } + logInfo(s"Finished decommissioning executor $executorId.") + } else { + logInfo(s"Skipping decommissioning of executor $executorId.") + } + shouldDisable + } + /** * Stop making resource offers for the given executor. The executor is marked as lost with * the loss reason still pending. 
@@ -511,8 +560,17 @@ class CoarseGrainedSchedulerBackend(scheduler: TaskSchedulerImpl, val rpcEnv: Rp } protected def removeWorker(workerId: String, host: String, message: String): Unit = { - driverEndpoint.ask[Boolean](RemoveWorker(workerId, host, message)).failed.foreach(t => - logError(t.getMessage, t))(ThreadUtils.sameThread) + driverEndpoint.send(RemoveWorker(workerId, host, message)) + } + + /** + * Called by subclasses when notified of a decommissioning executor. + */ + private[spark] def decommissionExecutor(executorId: String): Unit = { + if (driverEndpoint != null) { + logInfo("Propegating executor decommission to driver.") + driverEndpoint.send(DecommissionExecutor(executorId)) + } } def sufficientResourcesRegistered(): Boolean = true @@ -543,7 +601,9 @@ class CoarseGrainedSchedulerBackend(scheduler: TaskSchedulerImpl, val rpcEnv: Rp override def isExecutorActive(id: String): Boolean = synchronized { executorDataMap.contains(id) && !executorsPendingToRemove.contains(id) && - !executorsPendingLossReason.contains(id) + !executorsPendingLossReason.contains(id) && + !executorsPendingDecommission.contains(id) + } override def maxNumConcurrentTasks(): Int = synchronized { diff --git a/core/src/main/scala/org/apache/spark/scheduler/cluster/StandaloneSchedulerBackend.scala b/core/src/main/scala/org/apache/spark/scheduler/cluster/StandaloneSchedulerBackend.scala index d91d78b29f98d..42c46464d79e1 100644 --- a/core/src/main/scala/org/apache/spark/scheduler/cluster/StandaloneSchedulerBackend.scala +++ b/core/src/main/scala/org/apache/spark/scheduler/cluster/StandaloneSchedulerBackend.scala @@ -174,6 +174,12 @@ private[spark] class StandaloneSchedulerBackend( removeExecutor(fullId.split("/")(1), reason) } + override def executorDecommissioned(fullId: String, message: String) { + logInfo("Asked to decommission executor") + decommissionExecutor(fullId.split("/")(1)) + logInfo("Executor %s decommissioned: %s".format(fullId, message)) + } + override def workerRemoved(workerId: String, host: String, message: String): Unit = { logInfo("Worker %s removed: %s".format(workerId, message)) removeWorker(workerId, host, message) diff --git a/core/src/main/scala/org/apache/spark/util/SignalUtils.scala b/core/src/main/scala/org/apache/spark/util/SignalUtils.scala index 5a24965170cef..230195da2a121 100644 --- a/core/src/main/scala/org/apache/spark/util/SignalUtils.scala +++ b/core/src/main/scala/org/apache/spark/util/SignalUtils.scala @@ -60,7 +60,7 @@ private[spark] object SignalUtils extends Logging { if (SystemUtils.IS_OS_UNIX) { try { val handler = handlers.getOrElseUpdate(signal, { - logInfo("Registered signal handler for " + signal) + logInfo("Registering signal handler for " + signal) new ActionHandler(new Signal(signal)) }) handler.register(action) diff --git a/core/src/test/scala/org/apache/spark/deploy/client/AppClientSuite.scala b/core/src/test/scala/org/apache/spark/deploy/client/AppClientSuite.scala index a1d3077b8fc87..a3e39d7f53728 100644 --- a/core/src/test/scala/org/apache/spark/deploy/client/AppClientSuite.scala +++ b/core/src/test/scala/org/apache/spark/deploy/client/AppClientSuite.scala @@ -30,7 +30,7 @@ import org.apache.spark.deploy.{ApplicationDescription, Command} import org.apache.spark.deploy.DeployMessages.{MasterStateResponse, RequestMasterState} import org.apache.spark.deploy.master.{ApplicationInfo, Master} import org.apache.spark.deploy.worker.Worker -import org.apache.spark.internal.Logging +import org.apache.spark.internal.{config, Logging} import 
org.apache.spark.rpc.RpcEnv import org.apache.spark.util.Utils @@ -44,13 +44,13 @@ class AppClientSuite with Eventually with ScalaFutures { private val numWorkers = 2 - private val conf = new SparkConf() - private val securityManager = new SecurityManager(conf) + private var conf: SparkConf = null private var masterRpcEnv: RpcEnv = null private var workerRpcEnvs: Seq[RpcEnv] = null private var master: Master = null private var workers: Seq[Worker] = null + private var securityManager: SecurityManager = null /** * Start the local cluster. @@ -58,6 +58,8 @@ class AppClientSuite */ override def beforeAll(): Unit = { super.beforeAll() + conf = new SparkConf().set(config.Worker.WORKER_DECOMMISSION_ENABLED.key, "true") + securityManager = new SecurityManager(conf) masterRpcEnv = RpcEnv.create(Master.SYSTEM_NAME, "localhost", 0, conf, securityManager) workerRpcEnvs = (0 until numWorkers).map { i => RpcEnv.create(Worker.SYSTEM_NAME + i, "localhost", 0, conf, securityManager) @@ -111,8 +113,23 @@ class AppClientSuite assert(apps.head.getExecutorLimit === numExecutorsRequested, s"executor request failed") } + + // Save the executor id before decommissioning so we can kill it + val application = getApplications().head + val executors = application.executors + val executorId: String = executors.head._2.fullId + + // Send a decommission self to all the workers + // Note: normally the worker would send this on their own. + workers.foreach(worker => worker.decommissionSelf()) + + // Decommissioning is async. + eventually(timeout(1.seconds), interval(10.millis)) { + // We only record decommissioning for the executor we've requested + assert(ci.listener.execDecommissionedList.size === 1) + } + // Send request to kill executor, verify request was made - val executorId: String = getApplications().head.executors.head._2.fullId whenReady( ci.client.killExecutors(Seq(executorId)), timeout(10.seconds), @@ -120,6 +137,15 @@ class AppClientSuite assert(acknowledged) } + // Verify that asking for executors on the decommissioned workers fails + whenReady( + ci.client.requestTotalExecutors(numExecutorsRequested), + timeout(10.seconds), + interval(10.millis)) { acknowledged => + assert(acknowledged) + } + assert(getApplications().head.executors.size === 0) + // Issue stop command for Client to disconnect from Master ci.client.stop() @@ -189,6 +215,7 @@ class AppClientSuite val deadReasonList = new ConcurrentLinkedQueue[String]() val execAddedList = new ConcurrentLinkedQueue[String]() val execRemovedList = new ConcurrentLinkedQueue[String]() + val execDecommissionedList = new ConcurrentLinkedQueue[String]() def connected(id: String): Unit = { connectedIdList.add(id) @@ -218,6 +245,10 @@ class AppClientSuite execRemovedList.add(id) } + def executorDecommissioned(id: String, message: String): Unit = { + execDecommissionedList.add(id) + } + def workerRemoved(workerId: String, host: String, message: String): Unit = {} } diff --git a/core/src/test/scala/org/apache/spark/scheduler/DAGSchedulerSuite.scala b/core/src/test/scala/org/apache/spark/scheduler/DAGSchedulerSuite.scala index 101e60c73e9f8..e40b63fe13cb1 100644 --- a/core/src/test/scala/org/apache/spark/scheduler/DAGSchedulerSuite.scala +++ b/core/src/test/scala/org/apache/spark/scheduler/DAGSchedulerSuite.scala @@ -167,6 +167,7 @@ class DAGSchedulerSuite extends SparkFunSuite with LocalSparkContext with TimeLi } override def setDAGScheduler(dagScheduler: DAGScheduler) = {} override def defaultParallelism() = 2 + override def executorDecommission(executorId: String) = 
{} override def executorLost(executorId: String, reason: ExecutorLossReason): Unit = {} override def workerRemoved(workerId: String, host: String, message: String): Unit = {} override def applicationAttemptId(): Option[String] = None @@ -707,6 +708,7 @@ class DAGSchedulerSuite extends SparkFunSuite with LocalSparkContext with TimeLi accumUpdates: Array[(Long, Seq[AccumulatorV2[_, _]])], blockManagerId: BlockManagerId, executorUpdates: Map[(Int, Int), ExecutorMetrics]): Boolean = true + override def executorDecommission(executorId: String): Unit = {} override def executorLost(executorId: String, reason: ExecutorLossReason): Unit = {} override def workerRemoved(workerId: String, host: String, message: String): Unit = {} override def applicationAttemptId(): Option[String] = None diff --git a/core/src/test/scala/org/apache/spark/scheduler/ExternalClusterManagerSuite.scala b/core/src/test/scala/org/apache/spark/scheduler/ExternalClusterManagerSuite.scala index 4e71ec1ea7b37..9f593e0039adc 100644 --- a/core/src/test/scala/org/apache/spark/scheduler/ExternalClusterManagerSuite.scala +++ b/core/src/test/scala/org/apache/spark/scheduler/ExternalClusterManagerSuite.scala @@ -89,6 +89,7 @@ private class DummyTaskScheduler extends TaskScheduler { override def notifyPartitionCompletion(stageId: Int, partitionId: Int): Unit = {} override def setDAGScheduler(dagScheduler: DAGScheduler): Unit = {} override def defaultParallelism(): Int = 2 + override def executorDecommission(executorId: String): Unit = {} override def executorLost(executorId: String, reason: ExecutorLossReason): Unit = {} override def workerRemoved(workerId: String, host: String, message: String): Unit = {} override def applicationAttemptId(): Option[String] = None diff --git a/core/src/test/scala/org/apache/spark/scheduler/WorkerDecommissionSuite.scala b/core/src/test/scala/org/apache/spark/scheduler/WorkerDecommissionSuite.scala new file mode 100644 index 0000000000000..15733b0d932ec --- /dev/null +++ b/core/src/test/scala/org/apache/spark/scheduler/WorkerDecommissionSuite.scala @@ -0,0 +1,84 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.spark.scheduler + +import java.util.concurrent.Semaphore + +import scala.concurrent.TimeoutException +import scala.concurrent.duration._ + +import org.apache.spark.{LocalSparkContext, SparkConf, SparkContext, SparkException, SparkFunSuite} +import org.apache.spark.internal.config +import org.apache.spark.scheduler.cluster.StandaloneSchedulerBackend +import org.apache.spark.util.{RpcUtils, SerializableBuffer, ThreadUtils} + +class WorkerDecommissionSuite extends SparkFunSuite with LocalSparkContext { + + override def beforeEach(): Unit = { + val conf = new SparkConf().setAppName("test").setMaster("local") + .set(config.Worker.WORKER_DECOMMISSION_ENABLED, true) + + sc = new SparkContext("local-cluster[2, 1, 1024]", "test", conf) + } + + test("verify task with no decommissioning works as expected") { + val input = sc.parallelize(1 to 10) + input.count() + val sleepyRdd = input.mapPartitions{ x => + Thread.sleep(100) + x + } + assert(sleepyRdd.count() === 10) + } + + test("verify a task with all workers decommissioned succeeds") { + val input = sc.parallelize(1 to 10) + // Do a count to wait for the executors to be registered. + input.count() + val sleepyRdd = input.mapPartitions{ x => + Thread.sleep(50) + x + } + // Listen for the job + val sem = new Semaphore(0) + sc.addSparkListener(new SparkListener { + override def onTaskStart(taskStart: SparkListenerTaskStart): Unit = { + sem.release() + } + }) + // Start the task. + val asyncCount = sleepyRdd.countAsync() + // Wait for the job to have started + sem.acquire(1) + // Decommission all the executors, this should not halt the current task. + // decom.sh message passing is tested manually. + val sched = sc.schedulerBackend.asInstanceOf[StandaloneSchedulerBackend] + val execs = sched.getExecutorIds() + execs.foreach(execId => sched.decommissionExecutor(execId)) + val asyncCountResult = ThreadUtils.awaitResult(asyncCount, 2.seconds) + assert(asyncCountResult === 10) + // Try and launch task after decommissioning, this should fail + val postDecommissioned = input.map(x => x) + val postDecomAsyncCount = postDecommissioned.countAsync() + val thrown = intercept[java.util.concurrent.TimeoutException]{ + val result = ThreadUtils.awaitResult(postDecomAsyncCount, 2.seconds) + } + assert(postDecomAsyncCount.isCompleted === false, + "After exec decommission new task could not launch") + } +} diff --git a/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/deploy/k8s/KubernetesConf.scala b/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/deploy/k8s/KubernetesConf.scala index 09943b7974ed9..f42f3415baa15 100644 --- a/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/deploy/k8s/KubernetesConf.scala +++ b/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/deploy/k8s/KubernetesConf.scala @@ -55,6 +55,9 @@ private[spark] abstract class KubernetesConf(val sparkConf: SparkConf) { } } + def workerDecommissioning: Boolean = + sparkConf.get(org.apache.spark.internal.config.Worker.WORKER_DECOMMISSION_ENABLED) + def nodeSelector: Map[String, String] = KubernetesUtils.parsePrefixedKeyValuePairs(sparkConf, KUBERNETES_NODE_SELECTOR_PREFIX) diff --git a/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/deploy/k8s/features/BasicExecutorFeatureStep.scala b/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/deploy/k8s/features/BasicExecutorFeatureStep.scala index 6a26df2997fd2..f575241de9540 100644 --- 
a/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/deploy/k8s/features/BasicExecutorFeatureStep.scala +++ b/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/deploy/k8s/features/BasicExecutorFeatureStep.scala @@ -24,6 +24,7 @@ import org.apache.spark.{SecurityManager, SparkConf, SparkException} import org.apache.spark.deploy.k8s._ import org.apache.spark.deploy.k8s.Config._ import org.apache.spark.deploy.k8s.Constants._ +import org.apache.spark.internal.Logging import org.apache.spark.internal.config._ import org.apache.spark.internal.config.Python._ import org.apache.spark.rpc.RpcEndpointAddress @@ -33,7 +34,7 @@ import org.apache.spark.util.Utils private[spark] class BasicExecutorFeatureStep( kubernetesConf: KubernetesExecutorConf, secMgr: SecurityManager) - extends KubernetesFeatureConfigStep { + extends KubernetesFeatureConfigStep with Logging { // Consider moving some of these fields to KubernetesConf or KubernetesExecutorSpecificConf private val executorContainerImage = kubernetesConf @@ -186,6 +187,21 @@ private[spark] class BasicExecutorFeatureStep( .endResources() .build() }.getOrElse(executorContainer) + val containerWithLifecycle = + if (!kubernetesConf.workerDecommissioning) { + logInfo("Decommissioning not enabled, skipping shutdown script") + containerWithLimitCores + } else { + logInfo("Adding decommission script to lifecycle") + new ContainerBuilder(containerWithLimitCores).withNewLifecycle() + .withNewPreStop() + .withNewExec() + .addToCommand("/opt/decom.sh") + .endExec() + .endPreStop() + .endLifecycle() + .build() + } val ownerReference = kubernetesConf.driverPod.map { pod => new OwnerReferenceBuilder() .withController(true) @@ -213,6 +229,6 @@ private[spark] class BasicExecutorFeatureStep( kubernetesConf.get(KUBERNETES_EXECUTOR_SCHEDULER_NAME) .foreach(executorPod.getSpec.setSchedulerName) - SparkPod(executorPod, containerWithLimitCores) + SparkPod(executorPod, containerWithLifecycle) } } diff --git a/resource-managers/kubernetes/docker/src/main/dockerfiles/spark/Dockerfile b/resource-managers/kubernetes/docker/src/main/dockerfiles/spark/Dockerfile index 6ed37fc637b31..cc65a7da12eef 100644 --- a/resource-managers/kubernetes/docker/src/main/dockerfiles/spark/Dockerfile +++ b/resource-managers/kubernetes/docker/src/main/dockerfiles/spark/Dockerfile @@ -30,7 +30,7 @@ ARG spark_uid=185 RUN set -ex && \ apt-get update && \ ln -s /lib /lib64 && \ - apt install -y bash tini libc6 libpam-modules krb5-user libnss3 && \ + apt install -y bash tini libc6 libpam-modules krb5-user libnss3 procps && \ mkdir -p /opt/spark && \ mkdir -p /opt/spark/examples && \ mkdir -p /opt/spark/work-dir && \ @@ -45,6 +45,7 @@ COPY jars /opt/spark/jars COPY bin /opt/spark/bin COPY sbin /opt/spark/sbin COPY kubernetes/dockerfiles/spark/entrypoint.sh /opt/ +COPY kubernetes/dockerfiles/spark/decom.sh /opt/ COPY examples /opt/spark/examples COPY kubernetes/tests /opt/spark/tests COPY data /opt/spark/data @@ -53,6 +54,7 @@ ENV SPARK_HOME /opt/spark WORKDIR /opt/spark/work-dir RUN chmod g+w /opt/spark/work-dir +RUN chmod a+x /opt/decom.sh ENTRYPOINT [ "/opt/entrypoint.sh" ] diff --git a/resource-managers/kubernetes/docker/src/main/dockerfiles/spark/decom.sh b/resource-managers/kubernetes/docker/src/main/dockerfiles/spark/decom.sh new file mode 100755 index 0000000000000..8a5208d49a70f --- /dev/null +++ b/resource-managers/kubernetes/docker/src/main/dockerfiles/spark/decom.sh @@ -0,0 +1,35 @@ +#!/usr/bin/env bash + +# +# Licensed to the Apache Software Foundation (ASF) under 
one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + + +set -ex +echo "Asked to decommission" +# Find the pid to signal +date | tee -a ${LOG} +WORKER_PID=$(ps -o pid -C java | tail -n 1| awk '{ sub(/^[ \t]+/, ""); print }') +echo "Using worker pid $WORKER_PID" +kill -s SIGPWR ${WORKER_PID} +# For now we expect this to timeout, since we don't start exiting the backend. +echo "Waiting for worker pid to exit" +# If the worker does exit stop blocking the cleanup. +timeout 60 tail --pid=${WORKER_PID} -f /dev/null +date +echo "Done" +date +sleep 30 diff --git a/resource-managers/kubernetes/docker/src/main/dockerfiles/spark/entrypoint.sh b/resource-managers/kubernetes/docker/src/main/dockerfiles/spark/entrypoint.sh index 6ee3523c8edab..05ab782caecae 100755 --- a/resource-managers/kubernetes/docker/src/main/dockerfiles/spark/entrypoint.sh +++ b/resource-managers/kubernetes/docker/src/main/dockerfiles/spark/entrypoint.sh @@ -30,9 +30,9 @@ set -e # If there is no passwd entry for the container UID, attempt to create one if [ -z "$uidentry" ] ; then if [ -w /etc/passwd ] ; then - echo "$myuid:x:$myuid:$mygid:${SPARK_USER_NAME:-anonymous uid}:$SPARK_HOME:/bin/false" >> /etc/passwd + echo "$myuid:x:$myuid:$mygid:${SPARK_USER_NAME:-anonymous uid}:$SPARK_HOME:/bin/false" >> /etc/passwd else - echo "Container ENTRYPOINT failed to add passwd entry for anonymous UID" + echo "Container ENTRYPOINT failed to add passwd entry for anonymous UID" fi fi @@ -59,7 +59,7 @@ fi # If HADOOP_HOME is set and SPARK_DIST_CLASSPATH is not set, set it here so Hadoop jars are available to the executor. # It does not set SPARK_DIST_CLASSPATH if already set, to avoid overriding customizations of this value from elsewhere e.g. Docker/K8s. if [ -n ${HADOOP_HOME} ] && [ -z ${SPARK_DIST_CLASSPATH} ]; then - export SPARK_DIST_CLASSPATH=$($HADOOP_HOME/bin/hadoop classpath) + export SPARK_DIST_CLASSPATH=$($HADOOP_HOME/bin/hadoop classpath) fi if ! [ -z ${HADOOP_CONF_DIR+x} ]; then diff --git a/resource-managers/kubernetes/integration-tests/dev/dev-run-integration-tests.sh b/resource-managers/kubernetes/integration-tests/dev/dev-run-integration-tests.sh index 607bb243458a6..292abe91d35b6 100755 --- a/resource-managers/kubernetes/integration-tests/dev/dev-run-integration-tests.sh +++ b/resource-managers/kubernetes/integration-tests/dev/dev-run-integration-tests.sh @@ -16,7 +16,7 @@ # See the License for the specific language governing permissions and # limitations under the License. 
# -set -xo errexit +set -exo errexit TEST_ROOT_DIR=$(git rev-parse --show-toplevel) DEPLOY_MODE="minikube" @@ -42,6 +42,9 @@ SCALA_VERSION=$("$MVN" help:evaluate -Dexpression=scala.binary.version 2>/dev/nu | grep -v "WARNING"\ | tail -n 1) +export SCALA_VERSION +echo $SCALA_VERSION + # Parse arguments while (( "$#" )); do case $1 in @@ -110,7 +113,8 @@ while (( "$#" )); do shift ;; *) - break + echo "Unexpected command line flag $2 $1." + exit 1 ;; esac shift @@ -164,6 +168,7 @@ properties+=( -Dspark.kubernetes.test.jvmImage=$JVM_IMAGE_NAME -Dspark.kubernetes.test.pythonImage=$PYTHON_IMAGE_NAME -Dspark.kubernetes.test.rImage=$R_IMAGE_NAME + -Dlog4j.logger.org.apache.spark=DEBUG ) $TEST_ROOT_DIR/build/mvn integration-test -f $TEST_ROOT_DIR/pom.xml -pl resource-managers/kubernetes/integration-tests -am -Pscala-$SCALA_VERSION -Pkubernetes -Pkubernetes-integration-tests ${properties[@]} diff --git a/resource-managers/kubernetes/integration-tests/src/test/scala/org/apache/spark/deploy/k8s/integrationtest/DecommissionSuite.scala b/resource-managers/kubernetes/integration-tests/src/test/scala/org/apache/spark/deploy/k8s/integrationtest/DecommissionSuite.scala new file mode 100644 index 0000000000000..f5eab6e4bbad6 --- /dev/null +++ b/resource-managers/kubernetes/integration-tests/src/test/scala/org/apache/spark/deploy/k8s/integrationtest/DecommissionSuite.scala @@ -0,0 +1,49 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.spark.deploy.k8s.integrationtest + +import org.apache.spark.internal.config.Worker + +private[spark] trait DecommissionSuite { k8sSuite: KubernetesSuite => + + import DecommissionSuite._ + import KubernetesSuite.k8sTestTag + + test("Test basic decommissioning", k8sTestTag) { + sparkAppConf + .set(Worker.WORKER_DECOMMISSION_ENABLED.key, "true") + .set("spark.kubernetes.pyspark.pythonVersion", "3") + .set("spark.kubernetes.container.image", pyImage) + + runSparkApplicationAndVerifyCompletion( + appResource = PYSPARK_DECOMISSIONING, + mainClass = "", + expectedLogOnCompletion = Seq("decommissioning executor", + "Finished waiting, stopping Spark"), + appArgs = Array.empty[String], + driverPodChecker = doBasicDriverPyPodCheck, + executorPodChecker = doBasicExecutorPyPodCheck, + appLocator = appLocator, + isJVM = false, + decommissioningTest = true) + } +} + +private[spark] object DecommissionSuite { + val TEST_LOCAL_PYSPARK: String = "local:///opt/spark/tests/" + val PYSPARK_DECOMISSIONING: String = TEST_LOCAL_PYSPARK + "decommissioning.py" +} diff --git a/resource-managers/kubernetes/integration-tests/src/test/scala/org/apache/spark/deploy/k8s/integrationtest/KubernetesSuite.scala b/resource-managers/kubernetes/integration-tests/src/test/scala/org/apache/spark/deploy/k8s/integrationtest/KubernetesSuite.scala index 0d4fcccc35cf9..61e1f27b55462 100644 --- a/resource-managers/kubernetes/integration-tests/src/test/scala/org/apache/spark/deploy/k8s/integrationtest/KubernetesSuite.scala +++ b/resource-managers/kubernetes/integration-tests/src/test/scala/org/apache/spark/deploy/k8s/integrationtest/KubernetesSuite.scala @@ -42,7 +42,9 @@ import org.apache.spark.internal.config._ class KubernetesSuite extends SparkFunSuite with BeforeAndAfterAll with BeforeAndAfter with BasicTestsSuite with SecretsTestsSuite with PythonTestsSuite with ClientModeTestsSuite with PodTemplateSuite with PVTestsSuite - with DepsTestsSuite with RTestsSuite with Logging with Eventually with Matchers { + with DepsTestsSuite with DecommissionSuite with RTestsSuite with Logging with Eventually + with Matchers { + import KubernetesSuite._ @@ -254,6 +256,7 @@ class KubernetesSuite extends SparkFunSuite } } + // scalastyle:off argcount protected def runSparkApplicationAndVerifyCompletion( appResource: String, mainClass: String, @@ -264,60 +267,120 @@ class KubernetesSuite extends SparkFunSuite appLocator: String, isJVM: Boolean, pyFiles: Option[String] = None, - executorPatience: Option[(Option[Interval], Option[Timeout])] = None): Unit = { + executorPatience: Option[(Option[Interval], Option[Timeout])] = None, + decommissioningTest: Boolean = false): Unit = { + + // scalastyle:on argcount val appArguments = SparkAppArguments( mainAppResource = appResource, mainClass = mainClass, appArgs = appArgs) - SparkAppLauncher.launch( - appArguments, - sparkAppConf, - TIMEOUT.value.toSeconds.toInt, - sparkHomeDir, - isJVM, - pyFiles) - val driverPod = kubernetesTestComponents.kubernetesClient - .pods() - .withLabel("spark-app-locator", appLocator) - .withLabel("spark-role", "driver") - .list() - .getItems - .get(0) - driverPodChecker(driverPod) val execPods = scala.collection.mutable.Map[String, Pod]() + val (patienceInterval, patienceTimeout) = { + executorPatience match { + case Some(patience) => (patience._1.getOrElse(INTERVAL), patience._2.getOrElse(TIMEOUT)) + case _ => (INTERVAL, TIMEOUT) + } + } + def checkPodReady(namespace: String, name: String) = { + val execPod = kubernetesTestComponents.kubernetesClient + 
.pods() + .inNamespace(namespace) + .withName(name) + .get() + val resourceStatus = execPod.getStatus + val conditions = resourceStatus.getConditions().asScala + val conditionTypes = conditions.map(_.getType()) + val readyConditions = conditions.filter{cond => cond.getType() == "Ready"} + val result = readyConditions + .map(cond => cond.getStatus() == "True") + .headOption.getOrElse(false) + result + } val execWatcher = kubernetesTestComponents.kubernetesClient .pods() .withLabel("spark-app-locator", appLocator) .withLabel("spark-role", "executor") .watch(new Watcher[Pod] { - logInfo("Beginning watch of executors") + logDebug("Beginning watch of executors") override def onClose(cause: KubernetesClientException): Unit = logInfo("Ending watch of executors") override def eventReceived(action: Watcher.Action, resource: Pod): Unit = { val name = resource.getMetadata.getName + val namespace = resource.getMetadata().getNamespace() action match { - case Action.ADDED | Action.MODIFIED => + case Action.MODIFIED => + execPods(name) = resource + case Action.ADDED => + logDebug(s"Add event received for $name.") execPods(name) = resource + // If testing decommissioning start a thread to simulate + // decommissioning. + if (decommissioningTest && execPods.size == 1) { + // Wait for all the containers in the pod to be running + logDebug("Waiting for first pod to become OK prior to deletion") + Eventually.eventually(patienceTimeout, patienceInterval) { + val result = checkPodReady(namespace, name) + result shouldBe (true) + } + // Sleep a small interval to allow execution of job + logDebug("Sleeping before killing pod.") + Thread.sleep(2000) + // Delete the pod to simulate cluster scale down/migration. + val pod = kubernetesTestComponents.kubernetesClient.pods().withName(name) + pod.delete() + logDebug(s"Triggered pod decom/delete: $name deleted") + } case Action.DELETED | Action.ERROR => execPods.remove(name) } } }) - val (patienceInterval, patienceTimeout) = { - executorPatience match { - case Some(patience) => (patience._1.getOrElse(INTERVAL), patience._2.getOrElse(TIMEOUT)) - case _ => (INTERVAL, TIMEOUT) - } - } + logDebug("Starting Spark K8s job") + SparkAppLauncher.launch( + appArguments, + sparkAppConf, + TIMEOUT.value.toSeconds.toInt, + sparkHomeDir, + isJVM, + pyFiles) + val driverPod = kubernetesTestComponents.kubernetesClient + .pods() + .withLabel("spark-app-locator", appLocator) + .withLabel("spark-role", "driver") + .list() + .getItems + .get(0) + + driverPodChecker(driverPod) + // If we're testing decommissioning we delete all the executors, but we should have + // an executor at some point. Eventually.eventually(patienceTimeout, patienceInterval) { execPods.values.nonEmpty should be (true) } + // If decommissioning we need to wait and check the executors were removed + if (decommissioningTest) { + // Sleep a small interval to ensure everything is registered. + Thread.sleep(100) + // Wait for the executors to become ready + Eventually.eventually(patienceTimeout, patienceInterval) { + val anyReadyPods = ! 
execPods.map{ + case (name, resource) => + (name, resource.getMetadata().getNamespace()) + }.filter{ + case (name, namespace) => checkPodReady(namespace, name) + }.isEmpty + val podsEmpty = execPods.values.isEmpty + val podsReadyOrDead = anyReadyPods || podsEmpty + podsReadyOrDead shouldBe (true) + } + } execWatcher.close() execPods.values.foreach(executorPodChecker(_)) - Eventually.eventually(TIMEOUT, patienceInterval) { + Eventually.eventually(patienceTimeout, patienceInterval) { expectedLogOnCompletion.foreach { e => assert(kubernetesTestComponents.kubernetesClient .pods() @@ -425,5 +488,5 @@ private[spark] object KubernetesSuite { val SPARK_REMOTE_MAIN_CLASS: String = "org.apache.spark.examples.SparkRemoteFileTest" val SPARK_DRIVER_MAIN_CLASS: String = "org.apache.spark.examples.DriverSubmissionTest" val TIMEOUT = PatienceConfiguration.Timeout(Span(2, Minutes)) - val INTERVAL = PatienceConfiguration.Interval(Span(2, Seconds)) + val INTERVAL = PatienceConfiguration.Interval(Span(1, Seconds)) } diff --git a/resource-managers/kubernetes/integration-tests/tests/decommissioning.py b/resource-managers/kubernetes/integration-tests/tests/decommissioning.py new file mode 100644 index 0000000000000..f68f24d49763d --- /dev/null +++ b/resource-managers/kubernetes/integration-tests/tests/decommissioning.py @@ -0,0 +1,45 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +import sys +import time + +from pyspark.sql import SparkSession + + +if __name__ == "__main__": + """ + Usage: decommissioning + """ + print("Starting decom test") + spark = SparkSession \ + .builder \ + .appName("PyMemoryTest") \ + .getOrCreate() + sc = spark._sc + rdd = sc.parallelize(range(10)) + rdd.collect() + print("Waiting to give nodes time to finish.") + time.sleep(5) + rdd.collect() + print("Waiting some more....") + time.sleep(10) + rdd.collect() + print("Finished waiting, stopping Spark.") + spark.stop() + print("Done, exiting Python") + sys.exit(0) diff --git a/sbin/decommission-slave.sh b/sbin/decommission-slave.sh new file mode 100644 index 0000000000000..4bbf257ff1d3a --- /dev/null +++ b/sbin/decommission-slave.sh @@ -0,0 +1,57 @@ +#!/usr/bin/env bash + +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. 
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +# A shell script to decommission all workers on a single slave +# +# Environment variables +# +# SPARK_WORKER_INSTANCES The number of worker instances that should be +# running on this slave. Default is 1. + +# Usage: decommission-slave.sh [--block-until-exit] +# Decommissions all workers on this slave machine + +set -ex + +if [ -z "${SPARK_HOME}" ]; then + export SPARK_HOME="$(cd "`dirname "$0"`"/..; pwd)" +fi + +. "${SPARK_HOME}/sbin/spark-config.sh" + +. "${SPARK_HOME}/bin/load-spark-env.sh" + +if [ "$SPARK_WORKER_INSTANCES" = "" ]; then + "${SPARK_HOME}/sbin"/spark-daemon.sh decommission org.apache.spark.deploy.worker.Worker 1 +else + for ((i=0; i<$SPARK_WORKER_INSTANCES; i++)); do + "${SPARK_HOME}/sbin"/spark-daemon.sh decommission org.apache.spark.deploy.worker.Worker $(( $i + 1 )) + done +fi + +# Check if --block-until-exit is set. +# This is done for systems which block on the decommissioning script and on exit +# shut down the entire system (e.g. K8s). +if [ "$1" == "--block-until-exit" ]; then + shift + # For now we only block on the 0th instance if there are multiple instances. + instance=$1 + pid="$SPARK_PID_DIR/spark-$SPARK_IDENT_STRING-$command-$instance.pid" + wait $pid +fi diff --git a/sbin/spark-daemon.sh b/sbin/spark-daemon.sh index 6de67e039b48f..81f2fd40a706f 100755 --- a/sbin/spark-daemon.sh +++ b/sbin/spark-daemon.sh @@ -215,6 +215,21 @@ case $option in fi ;; + (decommission) + + if [ -f $pid ]; then + TARGET_ID="$(cat "$pid")" + if [[ $(ps -p "$TARGET_ID" -o comm=) =~ "java" ]]; then + echo "decommissioning $command" + kill -s SIGPWR "$TARGET_ID" + else + echo "no $command to decommission" + fi + else + echo "no $command to decommission" + fi + ;; + (status) if [ -f $pid ]; then From d0f961476031b62bda0d4d41f7248295d651ea92 Mon Sep 17 00:00:00 2001 From: DB Tsai Date: Fri, 14 Feb 2020 21:46:01 +0000 Subject: [PATCH 051/185] [SPARK-30289][SQL] Partitioned by Nested Column for `InMemoryTable` ### What changes were proposed in this pull request? 1. `InMemoryTable` was flattening the nested columns, and then the flattened columns were used to look up the indices, which is not correct. This PR implements partitioning by nested columns for `InMemoryTable`. ### Why are the changes needed? This PR implements partitioning by nested columns for `InMemoryTable`, so we can test this feature in DSv2. ### Does this PR introduce any user-facing change? No. ### How was this patch tested? Existing unit tests and new tests. Closes #26929 from dbtsai/addTests.
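To make the change below easier to follow, here is a minimal, Spark-free sketch of the key-extraction idea: resolve a dotted partition reference such as `ts.timezone` by walking nested rows level by level instead of flattening the schema and indexing into the top-level row. The `Row` type and the names below are illustrative stand-ins, not Spark's `InternalRow`/`StructType` API.

```scala
// Illustrative sketch only; a toy Row type stands in for Spark's InternalRow.
object NestedKeySketch {
  final case class Row(values: Map[String, Any])

  // Walk a field path such as List("ts", "timezone") one level at a time.
  @annotation.tailrec
  def extract(path: List[String], row: Row): Any = path match {
    case Nil => throw new IllegalArgumentException("empty field path")
    case name :: Nil => row.values(name)
    case name :: rest => row.values(name) match {
      case nested: Row => extract(rest, nested)
      case other => throw new IllegalArgumentException(s"$name is not a struct: $other")
    }
  }

  def main(args: Array[String]): Unit = {
    val row = Row(Map("id" -> 1, "ts" -> Row(Map("timezone" -> "America/Los_Angeles"))))
    // The partition key that partitionedBy($"ts.timezone") should produce for this row.
    println(extract(List("ts", "timezone"), row)) // America/Los_Angeles
  }
}
```

The actual change in `InMemoryTable` below applies the same walk over `InternalRow` and `StructType`.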
Authored-by: DB Tsai Signed-off-by: DB Tsai --- .../spark/sql/connector/InMemoryTable.scala | 35 ++++++-- .../spark/sql/DataFrameWriterV2Suite.scala | 86 ++++++++++++++++++- 2 files changed, 114 insertions(+), 7 deletions(-) diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/connector/InMemoryTable.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/connector/InMemoryTable.scala index c9e4e0aad5704..0187ae31e2d1c 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/connector/InMemoryTable.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/connector/InMemoryTable.scala @@ -26,7 +26,7 @@ import org.scalatest.Assertions._ import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.connector.catalog._ -import org.apache.spark.sql.connector.expressions.{IdentityTransform, Transform} +import org.apache.spark.sql.connector.expressions.{IdentityTransform, NamedReference, Transform} import org.apache.spark.sql.connector.read._ import org.apache.spark.sql.connector.write._ import org.apache.spark.sql.sources.{And, EqualTo, Filter, IsNotNull} @@ -59,10 +59,30 @@ class InMemoryTable( def rows: Seq[InternalRow] = dataMap.values.flatMap(_.rows).toSeq - private val partFieldNames = partitioning.flatMap(_.references).toSeq.flatMap(_.fieldNames) - private val partIndexes = partFieldNames.map(schema.fieldIndex) + private val partCols: Array[Array[String]] = partitioning.flatMap(_.references).map { ref => + schema.findNestedField(ref.fieldNames(), includeCollections = false) match { + case Some(_) => ref.fieldNames() + case None => throw new IllegalArgumentException(s"${ref.describe()} does not exist.") + } + } - private def getKey(row: InternalRow): Seq[Any] = partIndexes.map(row.toSeq(schema)(_)) + private def getKey(row: InternalRow): Seq[Any] = { + def extractor(fieldNames: Array[String], schema: StructType, row: InternalRow): Any = { + val index = schema.fieldIndex(fieldNames(0)) + val value = row.toSeq(schema).apply(index) + if (fieldNames.length > 1) { + (value, schema(index).dataType) match { + case (row: InternalRow, nestedSchema: StructType) => + extractor(fieldNames.drop(1), nestedSchema, row) + case (_, dataType) => + throw new IllegalArgumentException(s"Unsupported type, ${dataType.simpleString}") + } + } else { + value + } + } + partCols.map(fieldNames => extractor(fieldNames, schema, row)) + } def withData(data: Array[BufferedRows]): InMemoryTable = dataMap.synchronized { data.foreach(_.rows.foreach { row => @@ -146,8 +166,10 @@ class InMemoryTable( } private class Overwrite(filters: Array[Filter]) extends TestBatchWrite { + import org.apache.spark.sql.connector.catalog.CatalogV2Implicits.MultipartIdentifierHelper override def commit(messages: Array[WriterCommitMessage]): Unit = dataMap.synchronized { - val deleteKeys = InMemoryTable.filtersToKeys(dataMap.keys, partFieldNames, filters) + val deleteKeys = InMemoryTable.filtersToKeys( + dataMap.keys, partCols.map(_.toSeq.quoted), filters) dataMap --= deleteKeys withData(messages.map(_.asInstanceOf[BufferedRows])) } @@ -161,7 +183,8 @@ class InMemoryTable( } override def deleteWhere(filters: Array[Filter]): Unit = dataMap.synchronized { - dataMap --= InMemoryTable.filtersToKeys(dataMap.keys, partFieldNames, filters) + import org.apache.spark.sql.connector.catalog.CatalogV2Implicits.MultipartIdentifierHelper + dataMap --= InMemoryTable.filtersToKeys(dataMap.keys, partCols.map(_.toSeq.quoted), filters) } } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameWriterV2Suite.scala 
b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameWriterV2Suite.scala index d49dc58e93ddb..cd157086a8b8e 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameWriterV2Suite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameWriterV2Suite.scala @@ -17,20 +17,24 @@ package org.apache.spark.sql +import java.sql.Timestamp + import scala.collection.JavaConverters._ import org.scalatest.BeforeAndAfter import org.apache.spark.sql.catalyst.analysis.{CannotReplaceMissingTableException, NoSuchTableException, TableAlreadyExistsException} import org.apache.spark.sql.catalyst.plans.logical.{AppendData, LogicalPlan, OverwriteByExpression, OverwritePartitionsDynamic} -import org.apache.spark.sql.connector.InMemoryTableCatalog +import org.apache.spark.sql.connector.{InMemoryTable, InMemoryTableCatalog} import org.apache.spark.sql.connector.catalog.{Identifier, TableCatalog} import org.apache.spark.sql.connector.expressions.{BucketTransform, DaysTransform, FieldReference, HoursTransform, IdentityTransform, LiteralValue, MonthsTransform, YearsTransform} import org.apache.spark.sql.execution.QueryExecution import org.apache.spark.sql.execution.datasources.v2.DataSourceV2Relation import org.apache.spark.sql.test.SharedSparkSession import org.apache.spark.sql.types.{IntegerType, LongType, StringType, StructType} +import org.apache.spark.sql.types.TimestampType import org.apache.spark.sql.util.QueryExecutionListener +import org.apache.spark.unsafe.types.UTF8String import org.apache.spark.util.Utils class DataFrameWriterV2Suite extends QueryTest with SharedSparkSession with BeforeAndAfter { @@ -550,4 +554,84 @@ class DataFrameWriterV2Suite extends QueryTest with SharedSparkSession with Befo assert(replaced.partitioning.isEmpty) assert(replaced.properties === defaultOwnership.asJava) } + + test("SPARK-30289 Create: partitioned by nested column") { + val schema = new StructType().add("ts", new StructType() + .add("created", TimestampType) + .add("modified", TimestampType) + .add("timezone", StringType)) + + val data = Seq( + Row(Row(Timestamp.valueOf("2019-06-01 10:00:00"), Timestamp.valueOf("2019-09-02 07:00:00"), + "America/Los_Angeles")), + Row(Row(Timestamp.valueOf("2019-08-26 18:00:00"), Timestamp.valueOf("2019-09-26 18:00:00"), + "America/Los_Angeles")), + Row(Row(Timestamp.valueOf("2018-11-23 18:00:00"), Timestamp.valueOf("2018-12-22 18:00:00"), + "America/New_York"))) + val df = spark.createDataFrame(spark.sparkContext.parallelize(data, 1), schema) + + df.writeTo("testcat.table_name") + .partitionedBy($"ts.timezone") + .create() + + val table = catalog("testcat").loadTable(Identifier.of(Array(), "table_name")) + .asInstanceOf[InMemoryTable] + + assert(table.name === "testcat.table_name") + assert(table.partitioning === Seq(IdentityTransform(FieldReference(Array("ts", "timezone"))))) + checkAnswer(spark.table(table.name), data) + assert(table.dataMap.toArray.length == 2) + assert(table.dataMap(Seq(UTF8String.fromString("America/Los_Angeles"))).rows.size == 2) + assert(table.dataMap(Seq(UTF8String.fromString("America/New_York"))).rows.size == 1) + + // TODO: `DataSourceV2Strategy` can not translate nested fields into source filter yet + // so the following sql will fail. 
+ // sql("DELETE FROM testcat.table_name WHERE ts.timezone = \"America/Los_Angeles\"") + } + + test("SPARK-30289 Create: partitioned by multiple transforms on nested columns") { + spark.table("source") + .withColumn("ts", struct( + lit("2019-06-01 10:00:00.000000").cast("timestamp") as "created", + lit("2019-09-02 07:00:00.000000").cast("timestamp") as "modified", + lit("America/Los_Angeles") as "timezone")) + .writeTo("testcat.table_name") + .tableProperty("allow-unsupported-transforms", "true") + .partitionedBy( + years($"ts.created"), months($"ts.created"), days($"ts.created"), hours($"ts.created"), + years($"ts.modified"), months($"ts.modified"), days($"ts.modified"), hours($"ts.modified") + ) + .create() + + val table = catalog("testcat").loadTable(Identifier.of(Array(), "table_name")) + + assert(table.name === "testcat.table_name") + assert(table.partitioning === Seq( + YearsTransform(FieldReference(Array("ts", "created"))), + MonthsTransform(FieldReference(Array("ts", "created"))), + DaysTransform(FieldReference(Array("ts", "created"))), + HoursTransform(FieldReference(Array("ts", "created"))), + YearsTransform(FieldReference(Array("ts", "modified"))), + MonthsTransform(FieldReference(Array("ts", "modified"))), + DaysTransform(FieldReference(Array("ts", "modified"))), + HoursTransform(FieldReference(Array("ts", "modified"))))) + } + + test("SPARK-30289 Create: partitioned by bucket(4, ts.timezone)") { + spark.table("source") + .withColumn("ts", struct( + lit("2019-06-01 10:00:00.000000").cast("timestamp") as "created", + lit("2019-09-02 07:00:00.000000").cast("timestamp") as "modified", + lit("America/Los_Angeles") as "timezone")) + .writeTo("testcat.table_name") + .tableProperty("allow-unsupported-transforms", "true") + .partitionedBy(bucket(4, $"ts.timezone")) + .create() + + val table = catalog("testcat").loadTable(Identifier.of(Array(), "table_name")) + + assert(table.name === "testcat.table_name") + assert(table.partitioning === Seq(BucketTransform(LiteralValue(4, IntegerType), + Seq(FieldReference(Seq("ts", "timezone")))))) + } } From 8b73b92aadd685b29ef3d9b33366f5e1fd3dae99 Mon Sep 17 00:00:00 2001 From: Maxim Gekk Date: Sat, 15 Feb 2020 19:49:58 +0800 Subject: [PATCH 052/185] [SPARK-30826][SQL] Respect reference case in `StringStartsWith` pushed down to parquet ### What changes were proposed in this pull request? In the PR, I propose to convert the attribute name of `StringStartsWith` pushed down to the Parquet datasource to column reference via the `nameToParquetField` map. Similar conversions are performed for other source filters pushed down to parquet. ### Why are the changes needed? This fixes the bug described in [SPARK-30826](https://issues.apache.org/jira/browse/SPARK-30826). The query from an external table: ```sql CREATE TABLE t1 (col STRING) USING parquet OPTIONS (path '$path') ``` created on top of written parquet files by `Seq("42").toDF("COL").write.parquet(path)` returns wrong empty result: ```scala spark.sql("SELECT * FROM t1 WHERE col LIKE '4%'").show +---+ |col| +---+ +---+ ``` ### Does this PR introduce any user-facing change? Yes. After the changes the result is correct for the example above: ```scala spark.sql("SELECT * FROM t1 WHERE col LIKE '4%'").show +---+ |col| +---+ | 42| +---+ ``` ### How was this patch tested? Added a test to `ParquetFilterSuite` Closes #27574 from MaxGekk/parquet-StringStartsWith-case-sens. 
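Before the diff, a small self-contained sketch of the idea behind the fix: the attribute name in a pushed-down filter has to be resolved to the column name as it is spelled in the Parquet file, honoring case (in)sensitivity. The object and function names below are hypothetical; this is not Spark's actual `ParquetFilters` code.

```scala
// Hypothetical sketch of case-(in)sensitive column resolution; not Spark's implementation.
object ColumnResolutionSketch {
  // Build a resolver from a filter's attribute name to the column name spelled in the file.
  def resolver(fileColumns: Seq[String], caseSensitive: Boolean): String => Option[String] = {
    if (caseSensitive) {
      val exact = fileColumns.map(c => c -> c).toMap
      name => exact.get(name)
    } else {
      // Group by lower-cased name; an ambiguous duplicate (e.g. "col" and "COL") is left
      // unresolved here rather than guessed at.
      val byLower = fileColumns.groupBy(_.toLowerCase)
      name => byLower.get(name.toLowerCase).collect { case Seq(only) => only }
    }
  }

  def main(args: Array[String]): Unit = {
    val resolve = resolver(Seq("COL", "id"), caseSensitive = false)
    // A filter on `col` written against the table schema resolves to the file's "COL" column.
    println(resolve("col"))                                          // Some(COL)
    println(resolver(Seq("COL", "id"), caseSensitive = true)("col")) // None
  }
}
```

The test added in `ParquetFilterSuite` below exercises exactly this situation: data written with column `COL` and queried through a table whose schema spells it `col`.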
Authored-by: Maxim Gekk Signed-off-by: Wenchen Fan --- .../datasources/parquet/ParquetFilters.scala | 2 +- .../parquet/ParquetFilterSuite.scala | 21 +++++++++++++++++++ 2 files changed, 22 insertions(+), 1 deletion(-) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetFilters.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetFilters.scala index b9b86adb438e6..948a120e0d6e0 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetFilters.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetFilters.scala @@ -591,7 +591,7 @@ class ParquetFilters( case sources.StringStartsWith(name, prefix) if pushDownStartWith && canMakeFilterOn(name, prefix) => Option(prefix).map { v => - FilterApi.userDefined(binaryColumn(name), + FilterApi.userDefined(binaryColumn(nameToParquetField(name).fieldName), new UserDefinedPredicate[Binary] with Serializable { private val strToBinary = Binary.fromReusedByteArray(v.getBytes) private val size = strToBinary.length diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetFilterSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetFilterSuite.scala index 286bb1e920266..4e0c1c2dbe601 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetFilterSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetFilterSuite.scala @@ -1390,6 +1390,27 @@ abstract class ParquetFilterSuite extends QueryTest with ParquetTest with Shared } } } + + test("SPARK-30826: case insensitivity of StringStartsWith attribute") { + import testImplicits._ + withSQLConf(SQLConf.CASE_SENSITIVE.key -> "false") { + withTable("t1") { + withTempPath { dir => + val path = dir.toURI.toString + Seq("42").toDF("COL").write.parquet(path) + spark.sql( + s""" + |CREATE TABLE t1 (col STRING) + |USING parquet + |OPTIONS (path '$path') + """.stripMargin) + checkAnswer( + spark.sql("SELECT * FROM t1 WHERE col LIKE '4%'"), + Row("42")) + } + } + } + } } class ParquetV1FilterSuite extends ParquetFilterSuite { From f5238ea6cb0d2cfa69ae0488df94b29cc50065e0 Mon Sep 17 00:00:00 2001 From: "Wu, Xiaochang" Date: Sun, 16 Feb 2020 09:51:02 -0600 Subject: [PATCH 053/185] [GRAPHX][MINOR] Fix typo setRest => setDest ### What changes were proposed in this pull request? Fix typo def setRest(dstId: VertexId, localDstId: Int, dstAttr: VD, attr: ED) to def setDest(dstId: VertexId, localDstId: Int, dstAttr: VD, attr: ED) ### Why are the changes needed? Typo ### Does this PR introduce any user-facing change? No ### How was this patch tested? N/A Closes #27594 from xwu99/fix-graphx-setDest. 
Authored-by: Wu, Xiaochang Signed-off-by: Sean Owen --- .../scala/org/apache/spark/graphx/impl/EdgePartition.scala | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/graphx/src/main/scala/org/apache/spark/graphx/impl/EdgePartition.scala b/graphx/src/main/scala/org/apache/spark/graphx/impl/EdgePartition.scala index 8d03112a1c3dc..c0a2ba67d2942 100644 --- a/graphx/src/main/scala/org/apache/spark/graphx/impl/EdgePartition.scala +++ b/graphx/src/main/scala/org/apache/spark/graphx/impl/EdgePartition.scala @@ -465,7 +465,7 @@ class EdgePartition[ if (edgeIsActive) { val dstAttr = if (tripletFields.useDst) vertexAttrs(localDstId) else null.asInstanceOf[VD] - ctx.setRest(dstId, localDstId, dstAttr, data(pos)) + ctx.setDest(dstId, localDstId, dstAttr, data(pos)) sendMsg(ctx) } pos += 1 @@ -511,7 +511,7 @@ private class AggregatingEdgeContext[VD, ED, A]( _srcAttr = srcAttr } - def setRest(dstId: VertexId, localDstId: Int, dstAttr: VD, attr: ED): Unit = { + def setDest(dstId: VertexId, localDstId: Int, dstAttr: VD, attr: ED): Unit = { _dstId = dstId _localDstId = localDstId _dstAttr = dstAttr From 0a03e7e679771da8556fae72b35edf21ae71ac44 Mon Sep 17 00:00:00 2001 From: Huaxin Gao Date: Sun, 16 Feb 2020 09:53:12 -0600 Subject: [PATCH 054/185] [SPARK-30691][SQL][DOC][FOLLOW-UP] Make link names exactly the same as the side bar names ### What changes were proposed in this pull request? Make link names exactly the same as the side bar names ### Why are the changes needed? Make doc look better ### Does this PR introduce any user-facing change? before: ![image](https://user-images.githubusercontent.com/13592258/74578603-ad300100-4f4a-11ea-8430-11fccf31eab4.png) after: ![image](https://user-images.githubusercontent.com/13592258/74578670-eff1d900-4f4a-11ea-97d8-5908c0e50e95.png) ### How was this patch tested? Manually build and check the docs Closes #27591 from huaxingao/spark-doc-followup. 
Authored-by: Huaxin Gao Signed-off-by: Sean Owen --- docs/_data/menu-sql.yaml | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/docs/_data/menu-sql.yaml b/docs/_data/menu-sql.yaml index 1e343f630f88e..38a5cf61245a6 100644 --- a/docs/_data/menu-sql.yaml +++ b/docs/_data/menu-sql.yaml @@ -157,12 +157,12 @@ - text: Auxiliary Statements url: sql-ref-syntax-aux.html subitems: - - text: Analyze statement + - text: ANALYZE url: sql-ref-syntax-aux-analyze.html subitems: - text: ANALYZE TABLE url: sql-ref-syntax-aux-analyze-table.html - - text: Caching statements + - text: CACHE url: sql-ref-syntax-aux-cache.html subitems: - text: CACHE TABLE @@ -175,7 +175,7 @@ url: sql-ref-syntax-aux-refresh-table.html - text: REFRESH url: sql-ref-syntax-aux-cache-refresh.md - - text: Describe Commands + - text: DESCRIBE url: sql-ref-syntax-aux-describe.html subitems: - text: DESCRIBE DATABASE @@ -186,7 +186,7 @@ url: sql-ref-syntax-aux-describe-function.html - text: DESCRIBE QUERY url: sql-ref-syntax-aux-describe-query.html - - text: Show commands + - text: SHOW url: sql-ref-syntax-aux-show.html subitems: - text: SHOW COLUMNS @@ -205,14 +205,14 @@ url: sql-ref-syntax-aux-show-partitions.html - text: SHOW CREATE TABLE url: sql-ref-syntax-aux-show-create-table.html - - text: Configuration Management Commands + - text: CONFIGURATION MANAGEMENT url: sql-ref-syntax-aux-conf-mgmt.html subitems: - text: SET url: sql-ref-syntax-aux-conf-mgmt-set.html - text: RESET url: sql-ref-syntax-aux-conf-mgmt-reset.html - - text: Resource Management Commands + - text: RESOURCE MANAGEMENT url: sql-ref-syntax-aux-resource-mgmt.html subitems: - text: ADD FILE From 01cc852982cd065e08f9a652c14a0514f49fb662 Mon Sep 17 00:00:00 2001 From: Yuanjian Li Date: Sun, 16 Feb 2020 09:55:03 -0600 Subject: [PATCH 055/185] [SPARK-30803][DOCS] Fix the home page link for Scala API document ### What changes were proposed in this pull request? Change the link to the Scala API document. ``` $ git grep "#org.apache.spark.package" docs/_layouts/global.html:
<li><a href="api/scala/index.html#org.apache.spark.package">Scala</a></li>
  • docs/index.md:* [Spark Scala API (Scaladoc)](api/scala/index.html#org.apache.spark.package) docs/rdd-programming-guide.md:[Scala](api/scala/#org.apache.spark.package), [Java](api/java/), [Python](api/python/) and [R](api/R/). ``` ### Why are the changes needed? The home page link for Scala API document is incorrect after upgrade to 3.0 ### Does this PR introduce any user-facing change? Document UI change only. ### How was this patch tested? Local test, attach screenshots below: Before: ![image](https://user-images.githubusercontent.com/4833765/74335713-c2385300-4dd7-11ea-95d8-f5a3639d2578.png) After: ![image](https://user-images.githubusercontent.com/4833765/74335727-cbc1bb00-4dd7-11ea-89d9-4dcc1310e679.png) Closes #27549 from xuanyuanking/scala-doc. Authored-by: Yuanjian Li Signed-off-by: Sean Owen --- docs/_layouts/global.html | 2 +- docs/configuration.md | 8 +- docs/graphx-programming-guide.md | 68 +++++++------- docs/index.md | 2 +- docs/ml-advanced.md | 10 +- docs/ml-classification-regression.md | 40 ++++---- docs/ml-clustering.md | 10 +- docs/ml-collaborative-filtering.md | 2 +- docs/ml-datasource.md | 4 +- docs/ml-features.md | 92 +++++++++---------- docs/ml-frequent-pattern-mining.md | 4 +- docs/ml-migration-guide.md | 36 ++++---- docs/ml-pipeline.md | 10 +- docs/ml-statistics.md | 8 +- docs/ml-tuning.md | 18 ++-- docs/mllib-clustering.md | 26 +++--- docs/mllib-collaborative-filtering.md | 4 +- docs/mllib-data-types.md | 48 +++++----- docs/mllib-decision-tree.md | 10 +- docs/mllib-dimensionality-reduction.md | 6 +- docs/mllib-ensembles.md | 10 +- docs/mllib-evaluation-metrics.md | 8 +- docs/mllib-feature-extraction.md | 34 +++---- docs/mllib-frequent-pattern-mining.md | 14 +-- docs/mllib-isotonic-regression.md | 2 +- docs/mllib-linear-methods.md | 22 ++--- docs/mllib-naive-bayes.md | 8 +- docs/mllib-optimization.md | 14 +-- docs/mllib-pmml-model-export.md | 2 +- docs/mllib-statistics.md | 28 +++--- docs/quick-start.md | 2 +- docs/rdd-programming-guide.md | 28 +++--- docs/sql-data-sources-generic-options.md | 2 +- docs/sql-data-sources-jdbc.md | 2 +- docs/sql-data-sources-json.md | 2 +- docs/sql-getting-started.md | 16 ++-- docs/sql-migration-guide.md | 4 +- docs/sql-programming-guide.md | 2 +- docs/sql-ref-syntax-aux-analyze-table.md | 2 +- docs/sql-ref-syntax-aux-cache-refresh.md | 2 +- docs/sql-ref-syntax-aux-refresh-table.md | 2 +- docs/sql-ref-syntax-aux-resource-mgmt.md | 2 +- docs/sql-ref-syntax-aux-show-tables.md | 2 +- docs/sql-ref-syntax-aux-show.md | 2 +- docs/sql-ref-syntax-ddl-drop-database.md | 2 +- docs/sql-ref-syntax-ddl-drop-function.md | 2 +- ...tax-dml-insert-overwrite-directory-hive.md | 2 +- ...f-syntax-dml-insert-overwrite-directory.md | 2 +- docs/sql-ref-syntax-dml.md | 2 +- docs/sql-ref-syntax-qry-select-clusterby.md | 2 +- ...sql-ref-syntax-qry-select-distribute-by.md | 2 +- docs/sql-ref-syntax-qry-select-sortby.md | 2 +- docs/sql-ref-syntax-qry-select.md | 2 +- docs/streaming-custom-receivers.md | 2 +- docs/streaming-kafka-integration.md | 2 +- docs/streaming-kinesis-integration.md | 2 +- docs/streaming-programming-guide.md | 42 ++++----- .../structured-streaming-programming-guide.md | 22 ++--- docs/tuning.md | 2 +- 59 files changed, 355 insertions(+), 355 deletions(-) diff --git a/docs/_layouts/global.html b/docs/_layouts/global.html index d5fb18bfb06c0..d05ac6bbe129d 100755 --- a/docs/_layouts/global.html +++ b/docs/_layouts/global.html @@ -82,7 +82,7 @@