Skip to content

Commit a3d9ca3

Browse files
committed
[KYUUBI #1974] Support merge small files in multi insert statement
### _Why are the changes needed?_ This PR aims to support auto-merging small files in multi insert statements. For example, `FROM VALUES(1) INSERT INTO tmp1 SELECT * INSERT INTO tmp2 SELECT *;` will generate the following plan, where `Union` is the root node instead of `InsertIntoHiveTable` ``` Union :- InsertIntoHiveTable : +- Project : +- LocalRelation +- InsertIntoHiveTable +- Project +- LocalRelation ``` This PR also fixed `canInsertRepartitionByExpression`; previously it did not consider `SubqueryAlias`, which may cause inserting an erroneous `Repartition`/`Rebalance` node and corrupting the data distribution, e.g. `FROM (SELECT * FROM VALUES(1) DISTRIBUTE BY col1) INSERT INTO tmp1 SELECT * INSERT INTO tmp2 SELECT *;` ``` Union :- InsertIntoHiveTable : +- Project : +- SubqueryAlias : +- RepartitionByExpression : +- Project : +- LocalRelation +- InsertIntoHiveTable +- Project +- SubqueryAlias +- RepartitionByExpression +- Project +- LocalRelation ``` ### _How was this patch tested?_ - [ ] Add some test cases that check the changes thoroughly, including negative and positive cases if possible - [ ] Add screenshots for manual tests if appropriate - [x] [Run test](https://kyuubi.apache.org/docs/latest/develop_tools/testing.html#running-tests) locally before making a pull request Closes #1974 from pan3793/ext. Closes #1974 56cd773 [Cheng Pan] nit e0155c2 [Cheng Pan] Support merge small files in multi table insertion Authored-by: Cheng Pan <chengpan@apache.org> Signed-off-by: Cheng Pan <chengpan@apache.org>
1 parent 6d757a3 commit a3d9ca3

File tree

3 files changed

+66
-4
lines changed

3 files changed

+66
-4
lines changed

dev/kyuubi-extension-spark-3-1/src/test/scala/org/apache/spark/sql/RepartitionBeforeWritingSuite.scala

Lines changed: 33 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -26,14 +26,14 @@ import org.apache.kyuubi.sql.KyuubiSQLConf
2626

2727
class RepartitionBeforeWritingSuite extends KyuubiSparkSQLExtensionTest {
2828
test("check repartition exists") {
29-
def check(df: DataFrame): Unit = {
29+
def check(df: DataFrame, expectedRepartitionNum: Int = 1): Unit = {
3030
assert(
3131
df.queryExecution.analyzed.collect {
3232
case r: RepartitionByExpression =>
3333
assert(r.optNumPartitions ===
3434
spark.sessionState.conf.getConf(KyuubiSQLConf.INSERT_REPARTITION_NUM))
3535
r
36-
}.size == 1)
36+
}.size == expectedRepartitionNum)
3737
}
3838

3939
// It's better to set config explicitly in case of we change the default value.
@@ -45,13 +45,44 @@ class RepartitionBeforeWritingSuite extends KyuubiSparkSQLExtensionTest {
4545
"SELECT * FROM VALUES(1),(2) AS t(c1)"))
4646
}
4747

48+
withTable("tmp1", "tmp2") {
49+
sql(s"CREATE TABLE tmp1 (c1 int) $storage PARTITIONED BY (c2 string)")
50+
sql(s"CREATE TABLE tmp2 (c1 int) $storage PARTITIONED BY (c2 string)")
51+
check(
52+
sql(
53+
"""FROM VALUES(1),(2) AS t(c1)
54+
|INSERT INTO TABLE tmp1 PARTITION(c2='a') SELECT *
55+
|INSERT INTO TABLE tmp2 PARTITION(c2='a') SELECT *
56+
|""".stripMargin),
57+
2)
58+
}
59+
4860
withTable("tmp1") {
4961
sql(s"CREATE TABLE tmp1 (c1 int) $storage")
5062
check(sql("INSERT INTO TABLE tmp1 SELECT * FROM VALUES(1),(2),(3) AS t(c1)"))
5163
check(sql("INSERT INTO TABLE tmp1 " +
5264
"SELECT * FROM VALUES(1),(2),(3) AS t(c1) DISTRIBUTE BY c1"))
5365
}
5466

67+
withTable("tmp1", "tmp2") {
68+
sql(s"CREATE TABLE tmp1 (c1 int) $storage")
69+
sql(s"CREATE TABLE tmp2 (c1 int) $storage")
70+
check(
71+
sql(
72+
"""FROM VALUES(1),(2),(3)
73+
|INSERT INTO TABLE tmp1 SELECT *
74+
|INSERT INTO TABLE tmp2 SELECT *
75+
|""".stripMargin),
76+
2)
77+
check(
78+
sql(
79+
"""FROM (SELECT * FROM VALUES(1),(2),(3) AS t(c1) DISTRIBUTE BY c1)
80+
|INSERT INTO TABLE tmp1 SELECT *
81+
|INSERT INTO TABLE tmp2 SELECT *
82+
|""".stripMargin),
83+
2)
84+
}
85+
5586
withTable("tmp1") {
5687
sql(s"CREATE TABLE tmp1 $storage AS SELECT * FROM VALUES(1),(2),(3) AS t(c1)")
5788
}

dev/kyuubi-extension-spark-3-2/src/test/scala/org/apache/spark/sql/RebalanceBeforeWritingSuite.scala

Lines changed: 26 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -26,11 +26,11 @@ import org.apache.kyuubi.sql.KyuubiSQLConf
2626

2727
class RebalanceBeforeWritingSuite extends KyuubiSparkSQLExtensionTest {
2828
test("check rebalance exists") {
29-
def check(df: DataFrame): Unit = {
29+
def check(df: DataFrame, expectedRebalanceNum: Int = 1): Unit = {
3030
assert(
3131
df.queryExecution.analyzed.collect {
3232
case r: RebalancePartitions => r
33-
}.size == 1)
33+
}.size == expectedRebalanceNum)
3434
}
3535

3636
// It's better to set config explicitly in case of we change the default value.
@@ -42,11 +42,35 @@ class RebalanceBeforeWritingSuite extends KyuubiSparkSQLExtensionTest {
4242
"SELECT * FROM VALUES(1),(2) AS t(c1)"))
4343
}
4444

45+
withTable("tmp1", "tmp2") {
46+
sql(s"CREATE TABLE tmp1 (c1 int) $storage PARTITIONED BY (c2 string)")
47+
sql(s"CREATE TABLE tmp2 (c1 int) $storage PARTITIONED BY (c2 string)")
48+
check(
49+
sql(
50+
"""FROM VALUES(1),(2)
51+
|INSERT INTO TABLE tmp1 PARTITION(c2='a') SELECT *
52+
|INSERT INTO TABLE tmp2 PARTITION(c2='a') SELECT *
53+
|""".stripMargin),
54+
2)
55+
}
56+
4557
withTable("tmp1") {
4658
sql(s"CREATE TABLE tmp1 (c1 int) $storage")
4759
check(sql("INSERT INTO TABLE tmp1 SELECT * FROM VALUES(1),(2),(3) AS t(c1)"))
4860
}
4961

62+
withTable("tmp1", "tmp2") {
63+
sql(s"CREATE TABLE tmp1 (c1 int) $storage")
64+
sql(s"CREATE TABLE tmp2 (c1 int) $storage")
65+
check(
66+
sql(
67+
"""FROM VALUES(1),(2),(3)
68+
|INSERT INTO TABLE tmp1 SELECT *
69+
|INSERT INTO TABLE tmp2 SELECT *
70+
|""".stripMargin),
71+
2)
72+
}
73+
5074
withTable("tmp1") {
5175
sql(s"CREATE TABLE tmp1 $storage AS SELECT * FROM VALUES(1),(2),(3) AS t(c1)")
5276
}

dev/kyuubi-extension-spark-common/src/main/scala/org/apache/kyuubi/sql/RepartitionBeforeWritingBase.scala

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -59,6 +59,9 @@ abstract class RepartitionBeforeWritingDatasourceBase extends RepartitionBuilder
5959
query.output.filter(attr => table.partitionColumnNames.contains(attr.name))
6060
c.copy(query = buildRepartition(dynamicPartitionColumns, query))
6161

62+
case u @ Union(children, _, _) =>
63+
u.copy(children = children.map(addRepartition))
64+
6265
case _ => plan
6366
}
6467
}
@@ -98,13 +101,17 @@ abstract class RepartitionBeforeWritingHiveBase extends RepartitionBuilder {
98101
query.output.filter(attr => table.partitionColumnNames.contains(attr.name))
99102
c.copy(query = buildRepartition(dynamicPartitionColumns, query))
100103

104+
case u @ Union(children, _, _) =>
105+
u.copy(children = children.map(addRepartition))
106+
101107
case _ => plan
102108
}
103109
}
104110

105111
trait RepartitionBeforeWriteHelper {
106112
def canInsertRepartitionByExpression(plan: LogicalPlan): Boolean = plan match {
107113
case Project(_, child) => canInsertRepartitionByExpression(child)
114+
case SubqueryAlias(_, child) => canInsertRepartitionByExpression(child)
108115
case Limit(_, _) => false
109116
case _: Sort => false
110117
case _: RepartitionByExpression => false

0 commit comments

Comments
 (0)