From bb44edcabbaf41edaf13c7bcd6dd62b246db7140 Mon Sep 17 00:00:00 2001 From: Shardul Mahadik Date: Tue, 23 Feb 2021 01:37:53 -0800 Subject: [PATCH 1/4] ADD JAR with ivy coordinates should be compatible with Hive behavior --- .../apache/spark/util/DependencyUtils.scala | 13 ++++---- .../org/apache/spark/SparkContextSuite.scala | 33 +++++++++++++------ ...ql-ref-syntax-aux-resource-mgmt-add-jar.md | 2 +- 3 files changed, 31 insertions(+), 17 deletions(-) diff --git a/core/src/main/scala/org/apache/spark/util/DependencyUtils.scala b/core/src/main/scala/org/apache/spark/util/DependencyUtils.scala index 60e866a556796..9f3b4da6181f9 100644 --- a/core/src/main/scala/org/apache/spark/util/DependencyUtils.scala +++ b/core/src/main/scala/org/apache/spark/util/DependencyUtils.scala @@ -59,8 +59,8 @@ private[spark] object DependencyUtils extends Logging { * @param uri Ivy URI need to be downloaded. * @return Tuple value of parameter `transitive` and `exclude` value. * - * 1. transitive: whether to download dependency jar of Ivy URI, default value is false - * and this parameter value is case-sensitive. Invalid value will be treat as false. + * 1. transitive: whether to download dependency jar of Ivy URI, default value is true + * and this parameter value is case-insensitive. Invalid value will be treat as false. * Example: Input: exclude=org.mortbay.jetty:jetty&transitive=true * Output: true * @@ -72,7 +72,7 @@ private[spark] object DependencyUtils extends Logging { private def parseQueryParams(uri: URI): (Boolean, String) = { val uriQuery = uri.getQuery if (uriQuery == null) { - (false, "") + (true, "") } else { val mapTokens = uriQuery.split("&").map(_.split("=")) if (mapTokens.exists(isInvalidQueryString)) { @@ -81,14 +81,15 @@ private[spark] object DependencyUtils extends Logging { } val groupedParams = mapTokens.map(kv => (kv(0), kv(1))).groupBy(_._1) - // Parse transitive parameters (e.g., transitive=true) in an Ivy URI, default value is false + // Parse transitive parameters (e.g., transitive=true) in an Ivy URI, default value is true val transitiveParams = groupedParams.get("transitive") if (transitiveParams.map(_.size).getOrElse(0) > 1) { logWarning("It's best to specify `transitive` parameter in ivy URI query only once." + " If there are multiple `transitive` parameter, we will select the last one") } val transitive = - transitiveParams.flatMap(_.takeRight(1).map(_._2 == "true").headOption).getOrElse(false) + transitiveParams.flatMap(_.takeRight(1).map(_._2.equalsIgnoreCase("true")).headOption) + .getOrElse(true) // Parse an excluded list (e.g., exclude=org.mortbay.jetty:jetty,org.eclipse.jetty:jetty-http) // in an Ivy URI. When download Ivy URI jar, Spark won't download transitive jar @@ -125,7 +126,7 @@ private[spark] object DependencyUtils extends Logging { * `parameter=value¶meter=value...` * Note that currently Ivy URI query part support two parameters: * 1. transitive: whether to download dependent jars related to your Ivy URI. - * transitive=false or `transitive=true`, if not set, the default value is false. + * transitive=false or `transitive=true`, if not set, the default value is true. * 2. exclude: exclusion list when download Ivy URI jar and dependency jars. * The `exclude` parameter content is a ',' separated `group:module` pair string : * `exclude=group:module,group:module...` diff --git a/core/src/test/scala/org/apache/spark/SparkContextSuite.scala b/core/src/test/scala/org/apache/spark/SparkContextSuite.scala index 7a4970e60e932..0ba2a030dac12 100644 --- a/core/src/test/scala/org/apache/spark/SparkContextSuite.scala +++ b/core/src/test/scala/org/apache/spark/SparkContextSuite.scala @@ -1035,13 +1035,10 @@ class SparkContextSuite extends SparkFunSuite with LocalSparkContext with Eventu } } - test("SPARK-33084: Add jar support Ivy URI -- default transitive = false") { + test("SPARK-33084: Add jar support Ivy URI -- default transitive = true") { sc = new SparkContext(new SparkConf().setAppName("test").setMaster("local-cluster[3, 1, 1024]")) sc.addJar("ivy://org.apache.hive:hive-storage-api:2.7.0") assert(sc.listJars().exists(_.contains("org.apache.hive_hive-storage-api-2.7.0.jar"))) - assert(!sc.listJars().exists(_.contains("commons-lang_commons-lang-2.6.jar"))) - - sc.addJar("ivy://org.apache.hive:hive-storage-api:2.7.0?transitive=true") assert(sc.listJars().exists(_.contains("commons-lang_commons-lang-2.6.jar"))) } @@ -1083,6 +1080,22 @@ class SparkContextSuite extends SparkFunSuite with LocalSparkContext with Eventu } } + test("SPARK-34506: Add jar support Ivy URI -- transitive=false will not download " + + "dependency jars") { + sc = new SparkContext(new SparkConf().setAppName("test").setMaster("local-cluster[3, 1, 1024]")) + sc.addJar("ivy://org.apache.hive:hive-storage-api:2.7.0?transitive=false") + assert(sc.listJars().exists(_.contains("org.apache.hive_hive-storage-api-2.7.0.jar"))) + assert(!sc.listJars().exists(_.contains("commons-lang_commons-lang-2.6.jar"))) + } + + test("SPARK-34506: Add jar support Ivy URI -- test exclude param when transitive unspecified") { + sc = new SparkContext(new SparkConf().setAppName("test").setMaster("local-cluster[3, 1, 1024]")) + sc.addJar("ivy://org.apache.hive:hive-storage-api:2.7.0?exclude=commons-lang:commons-lang") + assert(sc.listJars().exists(_.contains("org.apache.hive_hive-storage-api-2.7.0.jar"))) + assert(sc.listJars().exists(_.contains("org.slf4j_slf4j-api-1.7.10.jar"))) + assert(!sc.listJars().exists(_.contains("commons-lang_commons-lang-2.6.jar"))) + } + test("SPARK-33084: Add jar support Ivy URI -- test exclude param when transitive=true") { sc = new SparkContext(new SparkConf().setAppName("test").setMaster("local-cluster[3, 1, 1024]")) sc.addJar("ivy://org.apache.hive:hive-storage-api:2.7.0" + @@ -1131,24 +1144,24 @@ class SparkContextSuite extends SparkFunSuite with LocalSparkContext with Eventu test("SPARK-33084: Add jar support Ivy URI -- test param key case sensitive") { sc = new SparkContext(new SparkConf().setAppName("test").setMaster("local-cluster[3, 1, 1024]")) - sc.addJar("ivy://org.apache.hive:hive-storage-api:2.7.0?TRANSITIVE=true") + sc.addJar("ivy://org.apache.hive:hive-storage-api:2.7.0?transitive=false") assert(sc.listJars().exists(_.contains("org.apache.hive_hive-storage-api-2.7.0.jar"))) assert(!sc.listJars().exists(_.contains("commons-lang_commons-lang-2.6.jar"))) - sc.addJar("ivy://org.apache.hive:hive-storage-api:2.7.0?transitive=true") + sc.addJar("ivy://org.apache.hive:hive-storage-api:2.7.0?TRANSITIVE=false") assert(sc.listJars().exists(_.contains("org.apache.hive_hive-storage-api-2.7.0.jar"))) assert(sc.listJars().exists(_.contains("commons-lang_commons-lang-2.6.jar"))) } - test("SPARK-33084: Add jar support Ivy URI -- test transitive value case sensitive") { + test("SPARK-33084: Add jar support Ivy URI -- test transitive value case insensitive") { sc = new SparkContext(new SparkConf().setAppName("test").setMaster("local-cluster[3, 1, 1024]")) - sc.addJar("ivy://org.apache.hive:hive-storage-api:2.7.0?transitive=TRUE") + sc.addJar("ivy://org.apache.hive:hive-storage-api:2.7.0?transitive=FALSE") assert(sc.listJars().exists(_.contains("org.apache.hive_hive-storage-api-2.7.0.jar"))) assert(!sc.listJars().exists(_.contains("commons-lang_commons-lang-2.6.jar"))) - sc.addJar("ivy://org.apache.hive:hive-storage-api:2.7.0?transitive=true") + sc.addJar("ivy://org.apache.hive:hive-storage-api:2.7.0?transitive=false") assert(sc.listJars().exists(_.contains("org.apache.hive_hive-storage-api-2.7.0.jar"))) - assert(sc.listJars().exists(_.contains("commons-lang_commons-lang-2.6.jar"))) + assert(!sc.listJars().exists(_.contains("commons-lang_commons-lang-2.6.jar"))) } test("SPARK-34346: hadoop configuration priority for spark/hive/hadoop configs") { diff --git a/docs/sql-ref-syntax-aux-resource-mgmt-add-jar.md b/docs/sql-ref-syntax-aux-resource-mgmt-add-jar.md index 6d31125fd612d..f1a7377bd6ff3 100644 --- a/docs/sql-ref-syntax-aux-resource-mgmt-add-jar.md +++ b/docs/sql-ref-syntax-aux-resource-mgmt-add-jar.md @@ -36,7 +36,7 @@ ADD JAR file_name The name of the JAR file to be added. It could be either on a local file system or a distributed file system or an Ivy URI. Apache Ivy is a popular dependency manager focusing on flexibility and simplicity. Now we support two parameter in URI query string: - * transitive: whether to download dependent jars related to your ivy URL. It is case-sensitive and only take last one if multiple transitive parameters are specified. + * transitive: whether to download dependent jars related to your ivy URL. The parameter name is case-sensitive, parameter value is case-insensitive and only take last one if multiple transitive parameters are specified. * exclude: exclusion list during downloading Ivy URI jar and dependent jars. User can write Ivy URI such as: From 7089d52eb5922d7a465885fceccbd8efe447a867 Mon Sep 17 00:00:00 2001 From: Shardul Mahadik Date: Tue, 23 Feb 2021 04:10:49 -0800 Subject: [PATCH 2/4] Fix tests --- .../test/scala/org/apache/spark/sql/SQLQuerySuite.scala | 8 ++++---- .../apache/spark/sql/hive/execution/HiveQuerySuite.scala | 5 ++++- 2 files changed, 8 insertions(+), 5 deletions(-) diff --git a/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala index e6a18c3894497..b85b64bad0240 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala @@ -3726,13 +3726,13 @@ class SQLQuerySuite extends QueryTest with SharedSparkSession with AdaptiveSpark test("SPARK-33084: Add jar support Ivy URI in SQL") { val sc = spark.sparkContext val hiveVersion = "2.3.8" - // default transitive=false, only download specified jar - sql(s"ADD JAR ivy://org.apache.hive.hcatalog:hive-hcatalog-core:$hiveVersion") + // transitive=false, only download specified jar + sql(s"ADD JAR ivy://org.apache.hive.hcatalog:hive-hcatalog-core:$hiveVersion?transitive=false") assert(sc.listJars() .exists(_.contains(s"org.apache.hive.hcatalog_hive-hcatalog-core-$hiveVersion.jar"))) - // test download ivy URL jar return multiple jars - sql("ADD JAR ivy://org.scala-js:scalajs-test-interface_2.12:1.2.0?transitive=true") + // default transitive=true, test download ivy URL jar return multiple jars + sql("ADD JAR ivy://org.scala-js:scalajs-test-interface_2.12:1.2.0") assert(sc.listJars().exists(_.contains("scalajs-library_2.12"))) assert(sc.listJars().exists(_.contains("scalajs-test-interface_2.12"))) diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveQuerySuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveQuerySuite.scala index d4bcba4128db9..903cee743dc1a 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveQuerySuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveQuerySuite.scala @@ -1224,7 +1224,10 @@ class HiveQuerySuite extends HiveComparisonTest with SQLTestUtils with BeforeAnd test("SPARK-33084: Add jar support Ivy URI in SQL") { val testData = TestHive.getHiveFile("data/files/sample.json").toURI withTable("t") { - sql(s"ADD JAR ivy://org.apache.hive.hcatalog:hive-hcatalog-core:$hiveVersion") + // exclude org.pentaho:pentaho-aggdesigner-algorithm as this transitive dependency does + // not exist on mavencentral and hence cannot be found in the test environment + sql(s"ADD JAR ivy://org.apache.hive.hcatalog:hive-hcatalog-core:$hiveVersion" + + "?exclude=org.pentaho:pentaho-aggdesigner-algorithm") sql( """CREATE TABLE t(a string, b string) |ROW FORMAT SERDE 'org.apache.hive.hcatalog.data.JsonSerDe'""".stripMargin) From c63b605ac65bd74c6647bf59b261e6412982e1e2 Mon Sep 17 00:00:00 2001 From: Shardul Mahadik Date: Tue, 23 Feb 2021 14:26:44 -0800 Subject: [PATCH 3/4] Fix tests [Take 2] --- .../apache/spark/sql/hive/execution/HiveQuerySuite.scala | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveQuerySuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveQuerySuite.scala index 903cee743dc1a..87c2541dc7555 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveQuerySuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveQuerySuite.scala @@ -1224,10 +1224,12 @@ class HiveQuerySuite extends HiveComparisonTest with SQLTestUtils with BeforeAnd test("SPARK-33084: Add jar support Ivy URI in SQL") { val testData = TestHive.getHiveFile("data/files/sample.json").toURI withTable("t") { - // exclude org.pentaho:pentaho-aggdesigner-algorithm as this transitive dependency does - // not exist on mavencentral and hence cannot be found in the test environment + // hive-catalog-core has some transitive dependencies which dont exist on maven central + // and hence cannot be found in the test environment or are non-jar (.pom) which cause + // failures in tests. Use transitive=false as it should be good enough to test the Ivy + // support in Hive ADD JAR sql(s"ADD JAR ivy://org.apache.hive.hcatalog:hive-hcatalog-core:$hiveVersion" + - "?exclude=org.pentaho:pentaho-aggdesigner-algorithm") + "?transitive=false") sql( """CREATE TABLE t(a string, b string) |ROW FORMAT SERDE 'org.apache.hive.hcatalog.data.JsonSerDe'""".stripMargin) From 61db81b54e595819d5c32f59bbdf2a1490b9fb8b Mon Sep 17 00:00:00 2001 From: Shardul Mahadik Date: Wed, 24 Feb 2021 13:06:30 -0800 Subject: [PATCH 4/4] Address PR comments --- .../src/main/scala/org/apache/spark/util/DependencyUtils.scala | 3 ++- docs/sql-ref-syntax-aux-resource-mgmt-add-jar.md | 2 +- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/core/src/main/scala/org/apache/spark/util/DependencyUtils.scala b/core/src/main/scala/org/apache/spark/util/DependencyUtils.scala index 9f3b4da6181f9..f7135edd2129d 100644 --- a/core/src/main/scala/org/apache/spark/util/DependencyUtils.scala +++ b/core/src/main/scala/org/apache/spark/util/DependencyUtils.scala @@ -60,7 +60,8 @@ private[spark] object DependencyUtils extends Logging { * @return Tuple value of parameter `transitive` and `exclude` value. * * 1. transitive: whether to download dependency jar of Ivy URI, default value is true - * and this parameter value is case-insensitive. Invalid value will be treat as false. + * and this parameter value is case-insensitive. This mimics Hive's behaviour for + * parsing the transitive parameter. Invalid value will be treat as false. * Example: Input: exclude=org.mortbay.jetty:jetty&transitive=true * Output: true * diff --git a/docs/sql-ref-syntax-aux-resource-mgmt-add-jar.md b/docs/sql-ref-syntax-aux-resource-mgmt-add-jar.md index f1a7377bd6ff3..e5ac58ba8195f 100644 --- a/docs/sql-ref-syntax-aux-resource-mgmt-add-jar.md +++ b/docs/sql-ref-syntax-aux-resource-mgmt-add-jar.md @@ -36,7 +36,7 @@ ADD JAR file_name The name of the JAR file to be added. It could be either on a local file system or a distributed file system or an Ivy URI. Apache Ivy is a popular dependency manager focusing on flexibility and simplicity. Now we support two parameter in URI query string: - * transitive: whether to download dependent jars related to your ivy URL. The parameter name is case-sensitive, parameter value is case-insensitive and only take last one if multiple transitive parameters are specified. + * transitive: whether to download dependent jars related to your ivy URL. The parameter name is case-sensitive, and the parameter value is case-insensitive. If multiple transitive parameters are specified, the last one wins. * exclude: exclusion list during downloading Ivy URI jar and dependent jars. User can write Ivy URI such as: