diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/NestedColumnAliasing.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/NestedColumnAliasing.scala index d9f2d36867b2d..95aefb6422d67 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/NestedColumnAliasing.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/NestedColumnAliasing.scala @@ -31,7 +31,7 @@ import org.apache.spark.sql.types._ object NestedColumnAliasing { def unapply(plan: LogicalPlan) - : Option[(Map[GetStructField, Alias], Map[ExprId, Seq[Alias]])] = plan match { + : Option[(Map[ExtractValue, Alias], Map[ExprId, Seq[Alias]])] = plan match { case Project(projectList, child) if SQLConf.get.nestedSchemaPruningEnabled && canProjectPushThrough(child) => getAliasSubMap(projectList) @@ -43,7 +43,7 @@ object NestedColumnAliasing { */ def replaceToAliases( plan: LogicalPlan, - nestedFieldToAlias: Map[GetStructField, Alias], + nestedFieldToAlias: Map[ExtractValue, Alias], attrToAliases: Map[ExprId, Seq[Alias]]): LogicalPlan = plan match { case Project(projectList, child) => Project( @@ -56,9 +56,9 @@ object NestedColumnAliasing { */ private def getNewProjectList( projectList: Seq[NamedExpression], - nestedFieldToAlias: Map[GetStructField, Alias]): Seq[NamedExpression] = { + nestedFieldToAlias: Map[ExtractValue, Alias]): Seq[NamedExpression] = { projectList.map(_.transform { - case f: GetStructField if nestedFieldToAlias.contains(f) => + case f: ExtractValue if nestedFieldToAlias.contains(f) => nestedFieldToAlias(f).toAttribute }.asInstanceOf[NamedExpression]) } @@ -86,32 +86,39 @@ object NestedColumnAliasing { } /** - * Return root references that are individually accessed as a whole, and `GetStructField`s. + * Return root references that are individually accessed as a whole, and `GetStructField`s + * or `GetArrayStructField`s which on top of other `ExtractValue`s or special expressions. + * Check `SelectedField` to see which expressions should be listed here. */ - private def collectRootReferenceAndGetStructField(e: Expression): Seq[Expression] = e match { - case _: AttributeReference | _: GetStructField => Seq(e) - case es if es.children.nonEmpty => es.children.flatMap(collectRootReferenceAndGetStructField) + private def collectRootReferenceAndExtractValue(e: Expression): Seq[Expression] = e match { + case _: AttributeReference => Seq(e) + case GetStructField(_: ExtractValue | _: AttributeReference, _, _) => Seq(e) + case GetArrayStructFields(_: MapValues | + _: MapKeys | + _: ExtractValue | + _: AttributeReference, _, _, _, _) => Seq(e) + case es if es.children.nonEmpty => es.children.flatMap(collectRootReferenceAndExtractValue) case _ => Seq.empty } /** * Return two maps in order to replace nested fields to aliases. * - * 1. GetStructField -> Alias: A new alias is created for each nested field. + * 1. ExtractValue -> Alias: A new alias is created for each nested field. * 2. ExprId -> Seq[Alias]: A reference attribute has multiple aliases pointing it. */ private def getAliasSubMap(projectList: Seq[NamedExpression]) - : Option[(Map[GetStructField, Alias], Map[ExprId, Seq[Alias]])] = { + : Option[(Map[ExtractValue, Alias], Map[ExprId, Seq[Alias]])] = { val (nestedFieldReferences, otherRootReferences) = - projectList.flatMap(collectRootReferenceAndGetStructField).partition { - case _: GetStructField => true + projectList.flatMap(collectRootReferenceAndExtractValue).partition { + case _: ExtractValue => true case _ => false } - val aliasSub = nestedFieldReferences.asInstanceOf[Seq[GetStructField]] + val aliasSub = nestedFieldReferences.asInstanceOf[Seq[ExtractValue]] .filter(!_.references.subsetOf(AttributeSet(otherRootReferences))) .groupBy(_.references.head) - .flatMap { case (attr, nestedFields: Seq[GetStructField]) => + .flatMap { case (attr, nestedFields: Seq[ExtractValue]) => // Each expression can contain multiple nested fields. // Note that we keep the original names to deliver to parquet in a case-sensitive way. val nestedFieldToAlias = nestedFields.distinct.map { f => diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/NestedColumnAliasingSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/NestedColumnAliasingSuite.scala index abb340e5c7510..ab2bd6dff1265 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/NestedColumnAliasingSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/NestedColumnAliasingSuite.scala @@ -25,7 +25,7 @@ import org.apache.spark.sql.catalyst.dsl.plans._ import org.apache.spark.sql.catalyst.expressions._ import org.apache.spark.sql.catalyst.plans.logical._ import org.apache.spark.sql.catalyst.rules.RuleExecutor -import org.apache.spark.sql.types.{StringType, StructType} +import org.apache.spark.sql.types.{StringType, StructField, StructType} class NestedColumnAliasingSuite extends SchemaPruningTest { @@ -221,6 +221,51 @@ class NestedColumnAliasingSuite extends SchemaPruningTest { comparePlans(optimized, expected) } + test("nested field pruning for getting struct field in array of struct") { + val field1 = GetArrayStructFields(child = 'friends, + field = StructField("first", StringType), + ordinal = 0, + numFields = 3, + containsNull = true) + val field2 = GetStructField('employer, 0, Some("id")) + + val query = contact + .limit(5) + .select(field1, field2) + .analyze + + val optimized = Optimize.execute(query) + + val expected = contact + .select(field1, field2) + .limit(5) + .analyze + comparePlans(optimized, expected) + } + + test("nested field pruning for getting struct field in map") { + val field1 = GetStructField(GetMapValue('relatives, Literal("key")), 0, Some("first")) + val field2 = GetArrayStructFields(child = MapValues('relatives), + field = StructField("middle", StringType), + ordinal = 1, + numFields = 3, + containsNull = true) + + val query = contact + .limit(5) + .select(field1, field2) + .analyze + + val optimized = Optimize.execute(query) + + val expected = contact + .select(field1, field2) + .limit(5) + .analyze + comparePlans(optimized, expected) + } + + private def collectGeneratedAliases(query: LogicalPlan): ArrayBuffer[String] = { val aliases = ArrayBuffer[String]() query.transformAllExpressions { diff --git a/sql/core/benchmarks/OrcNestedSchemaPruningBenchmark-results.txt b/sql/core/benchmarks/OrcNestedSchemaPruningBenchmark-results.txt index 53fbf14d500de..765193d6c6436 100644 --- a/sql/core/benchmarks/OrcNestedSchemaPruningBenchmark-results.txt +++ b/sql/core/benchmarks/OrcNestedSchemaPruningBenchmark-results.txt @@ -2,46 +2,52 @@ Nested Schema Pruning Benchmark For ORC v1 ================================================================================================ -OpenJDK 64-Bit Server VM 1.8.0_201-b09 on Linux 3.10.0-862.3.2.el7.x86_64 +OpenJDK 64-Bit Server VM 1.8.0_212-b04 on Linux 3.10.0-862.3.2.el7.x86_64 Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz Selection: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Top-level column 131 150 25 7.7 130.6 1.0X -Nested column 922 954 21 1.1 922.2 0.1X +Top-level column 127 163 24 7.9 127.1 1.0X +Nested column 974 1023 39 1.0 974.2 0.1X +Nested column in array 4834 4857 23 0.2 4834.1 0.0X -OpenJDK 64-Bit Server VM 1.8.0_201-b09 on Linux 3.10.0-862.3.2.el7.x86_64 +OpenJDK 64-Bit Server VM 1.8.0_212-b04 on Linux 3.10.0-862.3.2.el7.x86_64 Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz Limiting: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Top-level column 446 477 50 2.2 445.5 1.0X -Nested column 1328 1366 44 0.8 1328.4 0.3X +Top-level column 454 488 45 2.2 454.3 1.0X +Nested column 1539 1602 80 0.6 1539.3 0.3X +Nested column in array 5765 5848 69 0.2 5764.7 0.1X -OpenJDK 64-Bit Server VM 1.8.0_201-b09 on Linux 3.10.0-862.3.2.el7.x86_64 +OpenJDK 64-Bit Server VM 1.8.0_212-b04 on Linux 3.10.0-862.3.2.el7.x86_64 Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz Repartitioning: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Top-level column 357 386 33 2.8 356.8 1.0X -Nested column 1266 1274 7 0.8 1266.3 0.3X +Top-level column 365 395 58 2.7 364.9 1.0X +Nested column 1456 1477 23 0.7 1456.0 0.3X +Nested column in array 5734 5842 91 0.2 5734.4 0.1X -OpenJDK 64-Bit Server VM 1.8.0_201-b09 on Linux 3.10.0-862.3.2.el7.x86_64 +OpenJDK 64-Bit Server VM 1.8.0_212-b04 on Linux 3.10.0-862.3.2.el7.x86_64 Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz Repartitioning by exprs: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Top-level column 368 394 54 2.7 367.6 1.0X -Nested column 3890 3954 80 0.3 3890.3 0.1X +Top-level column 373 387 15 2.7 372.8 1.0X +Nested column 4349 4397 59 0.2 4348.8 0.1X +Nested column in array 8893 8971 73 0.1 8893.2 0.0X -OpenJDK 64-Bit Server VM 1.8.0_201-b09 on Linux 3.10.0-862.3.2.el7.x86_64 +OpenJDK 64-Bit Server VM 1.8.0_212-b04 on Linux 3.10.0-862.3.2.el7.x86_64 Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz Sample: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Top-level column 129 140 10 7.7 129.1 1.0X -Nested column 966 999 26 1.0 966.2 0.1X +Top-level column 130 159 24 7.7 129.9 1.0X +Nested column 1160 1216 50 0.9 1159.8 0.1X +Nested column in array 5297 5420 176 0.2 5296.8 0.0X -OpenJDK 64-Bit Server VM 1.8.0_201-b09 on Linux 3.10.0-862.3.2.el7.x86_64 +OpenJDK 64-Bit Server VM 1.8.0_212-b04 on Linux 3.10.0-862.3.2.el7.x86_64 Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz Sorting: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Top-level column 573 601 61 1.7 573.2 1.0X -Nested column 4417 4598 149 0.2 4417.1 0.1X +Top-level column 585 615 60 1.7 585.5 1.0X +Nested column 4972 5213 156 0.2 4972.2 0.1X +Nested column in array 10095 10156 32 0.1 10095.4 0.1X diff --git a/sql/core/benchmarks/OrcV2NestedSchemaPruningBenchmark-results.txt b/sql/core/benchmarks/OrcV2NestedSchemaPruningBenchmark-results.txt index 1bb8a68c19d9a..fdd347f4bad9b 100644 --- a/sql/core/benchmarks/OrcV2NestedSchemaPruningBenchmark-results.txt +++ b/sql/core/benchmarks/OrcV2NestedSchemaPruningBenchmark-results.txt @@ -2,46 +2,52 @@ Nested Schema Pruning Benchmark For ORC v2 ================================================================================================ -OpenJDK 64-Bit Server VM 1.8.0_191-8u191-b12-2ubuntu0.18.04.1-b12 on Linux 4.15.0-1021-aws +OpenJDK 64-Bit Server VM 1.8.0_212-b04 on Linux 3.10.0-862.3.2.el7.x86_64 Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz Selection: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Top-level column 117 144 21 8.5 117.0 1.0X -Nested column 1114 1138 23 0.9 1114.2 0.1X +Top-level column 122 161 29 8.2 121.9 1.0X +Nested column 1255 1279 23 0.8 1255.4 0.1X +Nested column in array 5352 5393 37 0.2 5352.3 0.0X -OpenJDK 64-Bit Server VM 1.8.0_191-8u191-b12-2ubuntu0.18.04.1-b12 on Linux 4.15.0-1021-aws +OpenJDK 64-Bit Server VM 1.8.0_212-b04 on Linux 3.10.0-862.3.2.el7.x86_64 Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz Limiting: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Top-level column 134 154 17 7.5 134.0 1.0X -Nested column 1156 1220 51 0.9 1156.2 0.1X +Top-level column 132 162 32 7.6 131.8 1.0X +Nested column 1246 1286 32 0.8 1245.6 0.1X +Nested column in array 5395 5542 143 0.2 5394.9 0.0X -OpenJDK 64-Bit Server VM 1.8.0_191-8u191-b12-2ubuntu0.18.04.1-b12 on Linux 4.15.0-1021-aws +OpenJDK 64-Bit Server VM 1.8.0_212-b04 on Linux 3.10.0-862.3.2.el7.x86_64 Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz Repartitioning: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Top-level column 350 359 6 2.9 350.5 1.0X -Nested column 1426 1443 13 0.7 1426.5 0.2X +Top-level column 385 403 20 2.6 385.4 1.0X +Nested column 1663 1691 52 0.6 1663.2 0.2X +Nested column in array 6264 6335 73 0.2 6264.4 0.1X -OpenJDK 64-Bit Server VM 1.8.0_191-8u191-b12-2ubuntu0.18.04.1-b12 on Linux 4.15.0-1021-aws +OpenJDK 64-Bit Server VM 1.8.0_212-b04 on Linux 3.10.0-862.3.2.el7.x86_64 Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz Repartitioning by exprs: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Top-level column 357 363 5 2.8 356.9 1.0X -Nested column 4186 4245 71 0.2 4186.3 0.1X +Top-level column 392 422 58 2.5 392.2 1.0X +Nested column 4104 4153 57 0.2 4104.0 0.1X +Nested column in array 8668 8748 55 0.1 8668.3 0.0X -OpenJDK 64-Bit Server VM 1.8.0_191-8u191-b12-2ubuntu0.18.04.1-b12 on Linux 4.15.0-1021-aws +OpenJDK 64-Bit Server VM 1.8.0_212-b04 on Linux 3.10.0-862.3.2.el7.x86_64 Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz Sample: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Top-level column 115 134 13 8.7 114.6 1.0X -Nested column 1148 1199 59 0.9 1148.3 0.1X +Top-level column 130 146 22 7.7 130.1 1.0X +Nested column 1127 1166 53 0.9 1127.3 0.1X +Nested column in array 4906 4968 40 0.2 4905.8 0.0X -OpenJDK 64-Bit Server VM 1.8.0_191-8u191-b12-2ubuntu0.18.04.1-b12 on Linux 4.15.0-1021-aws +OpenJDK 64-Bit Server VM 1.8.0_212-b04 on Linux 3.10.0-862.3.2.el7.x86_64 Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz Sorting: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Top-level column 274 279 3 3.6 274.1 1.0X -Nested column 3133 3254 145 0.3 3132.7 0.1X +Top-level column 291 308 25 3.4 290.5 1.0X +Nested column 3016 3091 58 0.3 3016.0 0.1X +Nested column in array 7730 7821 140 0.1 7729.5 0.0X diff --git a/sql/core/benchmarks/ParquetNestedSchemaPruningBenchmark-results.txt b/sql/core/benchmarks/ParquetNestedSchemaPruningBenchmark-results.txt index bd5b39ad8d4ef..4e0c368b5370e 100644 --- a/sql/core/benchmarks/ParquetNestedSchemaPruningBenchmark-results.txt +++ b/sql/core/benchmarks/ParquetNestedSchemaPruningBenchmark-results.txt @@ -2,46 +2,52 @@ Nested Schema Pruning Benchmark For Parquet ================================================================================================ -OpenJDK 64-Bit Server VM 1.8.0_201-b09 on Linux 3.10.0-862.3.2.el7.x86_64 +OpenJDK 64-Bit Server VM 1.8.0_212-b04 on Linux 3.10.0-862.3.2.el7.x86_64 Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz Selection: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Top-level column 128 166 24 7.8 128.0 1.0X -Nested column 308 325 10 3.2 308.3 0.4X +Top-level column 151 174 16 6.6 151.3 1.0X +Nested column 316 375 88 3.2 315.7 0.5X +Nested column in array 1277 1292 11 0.8 1277.0 0.1X -OpenJDK 64-Bit Server VM 1.8.0_201-b09 on Linux 3.10.0-862.3.2.el7.x86_64 +OpenJDK 64-Bit Server VM 1.8.0_212-b04 on Linux 3.10.0-862.3.2.el7.x86_64 Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz Limiting: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Top-level column 447 496 91 2.2 447.0 1.0X -Nested column 631 666 40 1.6 631.2 0.7X +Top-level column 452 501 67 2.2 451.9 1.0X +Nested column 664 722 77 1.5 664.4 0.7X +Nested column in array 1906 1997 91 0.5 1905.6 0.2X -OpenJDK 64-Bit Server VM 1.8.0_201-b09 on Linux 3.10.0-862.3.2.el7.x86_64 +OpenJDK 64-Bit Server VM 1.8.0_212-b04 on Linux 3.10.0-862.3.2.el7.x86_64 Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz Repartitioning: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Top-level column 360 394 84 2.8 360.0 1.0X -Nested column 553 586 65 1.8 553.5 0.7X +Top-level column 385 410 39 2.6 385.5 1.0X +Nested column 612 620 10 1.6 611.9 0.6X +Nested column in array 1790 1845 80 0.6 1789.5 0.2X -OpenJDK 64-Bit Server VM 1.8.0_201-b09 on Linux 3.10.0-862.3.2.el7.x86_64 +OpenJDK 64-Bit Server VM 1.8.0_212-b04 on Linux 3.10.0-862.3.2.el7.x86_64 Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz Repartitioning by exprs: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Top-level column 368 393 50 2.7 368.3 1.0X -Nested column 2942 3017 82 0.3 2942.0 0.1X +Top-level column 386 402 26 2.6 386.0 1.0X +Nested column 2982 3057 64 0.3 2982.0 0.1X +Nested column in array 3504 3690 248 0.3 3503.7 0.1X -OpenJDK 64-Bit Server VM 1.8.0_201-b09 on Linux 3.10.0-862.3.2.el7.x86_64 +OpenJDK 64-Bit Server VM 1.8.0_212-b04 on Linux 3.10.0-862.3.2.el7.x86_64 Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz Sample: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Top-level column 124 143 10 8.1 124.1 1.0X -Nested column 345 366 34 2.9 344.8 0.4X +Top-level column 138 152 10 7.2 138.3 1.0X +Nested column 345 369 16 2.9 344.8 0.4X +Nested column in array 1358 1405 50 0.7 1358.5 0.1X -OpenJDK 64-Bit Server VM 1.8.0_201-b09 on Linux 3.10.0-862.3.2.el7.x86_64 +OpenJDK 64-Bit Server VM 1.8.0_212-b04 on Linux 3.10.0-862.3.2.el7.x86_64 Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz Sorting: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Top-level column 577 618 55 1.7 576.8 1.0X -Nested column 3473 3524 49 0.3 3473.0 0.2X +Top-level column 606 632 45 1.6 606.3 1.0X +Nested column 3586 3679 107 0.3 3585.8 0.2X +Nested column in array 4452 4831 244 0.2 4451.8 0.1X diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/NestedSchemaPruningBenchmark.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/NestedSchemaPruningBenchmark.scala index 9ffb3f9328229..96f90f29707d2 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/NestedSchemaPruningBenchmark.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/NestedSchemaPruningBenchmark.scala @@ -33,13 +33,17 @@ abstract class NestedSchemaPruningBenchmark extends SqlBasedBenchmark { protected val N = 1000000 protected val numIters = 10 - // We use `col1 BIGINT, col2 STRUCT<_1: BIGINT, _2: STRING>` as a test schema. - // col1 and col2._1 is used for comparision. col2._2 mimics the burden for the other columns + // We use `col1 BIGINT, col2 STRUCT<_1: BIGINT, _2: STRING>, + // col3 ARRAY>` as a test schema. + // col1, col2._1 and col3._1 are used for comparision. col2._2 and col3._2 mimics the burden + // for the other columns private val df = spark .range(N * 10) .sample(false, 0.1) - .map(x => (x, (x, s"$x" * 100))) - .toDF("col1", "col2") + .map { x => + val col3 = (0 until 5).map(i => (x + i, s"$x" * 5)) + (x, (x, s"$x" * 100), col3) + }.toDF("col1", "col2", "col3") private def addCase(benchmark: Benchmark, name: String, sql: String): Unit = { benchmark.addCase(name) { _ => @@ -51,7 +55,7 @@ abstract class NestedSchemaPruningBenchmark extends SqlBasedBenchmark { withTempPath { dir => val path = dir.getCanonicalPath - Seq(1, 2).foreach { i => + Seq(1, 2, 3).foreach { i => df.write.format(dataSourceName).save(path + s"/$i") spark.read.format(dataSourceName).load(path + s"/$i").createOrReplaceTempView(s"t$i") } @@ -60,6 +64,7 @@ abstract class NestedSchemaPruningBenchmark extends SqlBasedBenchmark { addCase(benchmark, "Top-level column", "SELECT col1 FROM (SELECT col1 FROM t1)") addCase(benchmark, "Nested column", "SELECT col2._1 FROM (SELECT col2 FROM t2)") + addCase(benchmark, "Nested column in array", "SELECT col3._1 FROM (SELECT col3 FROM t3)") benchmark.run() } @@ -69,7 +74,7 @@ abstract class NestedSchemaPruningBenchmark extends SqlBasedBenchmark { withTempPath { dir => val path = dir.getCanonicalPath - Seq(1, 2).foreach { i => + Seq(1, 2, 3).foreach { i => df.write.format(dataSourceName).save(path + s"/$i") spark.read.format(dataSourceName).load(path + s"/$i").createOrReplaceTempView(s"t$i") } @@ -80,6 +85,8 @@ abstract class NestedSchemaPruningBenchmark extends SqlBasedBenchmark { s"SELECT col1 FROM (SELECT col1 FROM t1 LIMIT ${Int.MaxValue})") addCase(benchmark, "Nested column", s"SELECT col2._1 FROM (SELECT col2 FROM t2 LIMIT ${Int.MaxValue})") + addCase(benchmark, "Nested column in array", + s"SELECT col3._1 FROM (SELECT col3 FROM t3 LIMIT ${Int.MaxValue})") benchmark.run() } @@ -89,7 +96,7 @@ abstract class NestedSchemaPruningBenchmark extends SqlBasedBenchmark { withTempPath { dir => val path = dir.getCanonicalPath - Seq(1, 2).foreach { i => + Seq(1, 2, 3).foreach { i => df.write.format(dataSourceName).save(path + s"/$i") spark.read.format(dataSourceName).load(path + s"/$i").createOrReplaceTempView(s"t$i") } @@ -100,6 +107,8 @@ abstract class NestedSchemaPruningBenchmark extends SqlBasedBenchmark { s"SELECT col1 FROM (SELECT /*+ REPARTITION(1) */ col1 FROM t1)") addCase(benchmark, "Nested column", s"SELECT col2._1 FROM (SELECT /*+ REPARTITION(1) */ col2 FROM t2)") + addCase(benchmark, "Nested column in array", + s"SELECT col3._1 FROM (SELECT /*+ REPARTITION(1) */ col3 FROM t3)") benchmark.run() } @@ -109,7 +118,7 @@ abstract class NestedSchemaPruningBenchmark extends SqlBasedBenchmark { withTempPath { dir => val path = dir.getCanonicalPath - Seq(1, 2).foreach { i => + Seq(1, 2, 3).foreach { i => df.write.format(dataSourceName).save(path + s"/$i") spark.read.format(dataSourceName).load(path + s"/$i").createOrReplaceTempView(s"t$i") } @@ -120,6 +129,8 @@ abstract class NestedSchemaPruningBenchmark extends SqlBasedBenchmark { s"SELECT col1 FROM (SELECT col1 FROM t1 DISTRIBUTE BY col1)") addCase(benchmark, "Nested column", s"SELECT col2._1 FROM (SELECT col2 FROM t2 DISTRIBUTE BY col2._1)") + addCase(benchmark, "Nested column in array", + s"SELECT col3._1 FROM (SELECT col3 FROM t3 DISTRIBUTE BY col3._1)") benchmark.run() } @@ -129,7 +140,7 @@ abstract class NestedSchemaPruningBenchmark extends SqlBasedBenchmark { withTempPath { dir => val path = dir.getCanonicalPath - Seq(1, 2).foreach { i => + Seq(1, 2, 3).foreach { i => df.write.format(dataSourceName).save(path + s"/$i") spark.read.format(dataSourceName).load(path + s"/$i").createOrReplaceTempView(s"t$i") } @@ -140,6 +151,8 @@ abstract class NestedSchemaPruningBenchmark extends SqlBasedBenchmark { s"SELECT col1 FROM (SELECT col1 FROM t1 TABLESAMPLE(100 percent))") addCase(benchmark, "Nested column", s"SELECT col2._1 FROM (SELECT col2 FROM t2 TABLESAMPLE(100 percent))") + addCase(benchmark, "Nested column in array", + s"SELECT col3._1 FROM (SELECT col3 FROM t3 TABLESAMPLE(100 percent))") benchmark.run() } @@ -149,7 +162,7 @@ abstract class NestedSchemaPruningBenchmark extends SqlBasedBenchmark { withTempPath { dir => val path = dir.getCanonicalPath - Seq(1, 2).foreach { i => + Seq(1, 2, 3).foreach { i => df.write.format(dataSourceName).save(path + s"/$i") spark.read.format(dataSourceName).load(path + s"/$i").createOrReplaceTempView(s"t$i") } @@ -158,6 +171,7 @@ abstract class NestedSchemaPruningBenchmark extends SqlBasedBenchmark { addCase(benchmark, "Top-level column", "SELECT col1 FROM t1 ORDER BY col1") addCase(benchmark, "Nested column", "SELECT col2._1 FROM t2 ORDER BY col2._1") + addCase(benchmark, "Nested column in array", "SELECT col3._1 FROM t3 ORDER BY col3._1") benchmark.run() }