|
| 1 | +/* |
| 2 | + * Licensed to the Apache Software Foundation (ASF) under one or more |
| 3 | + * contributor license agreements. See the NOTICE file distributed with |
| 4 | + * this work for additional information regarding copyright ownership. |
| 5 | + * The ASF licenses this file to You under the Apache License, Version 2.0 |
| 6 | + * (the "License"); you may not use this file except in compliance with |
| 7 | + * the License. You may obtain a copy of the License at |
| 8 | + * |
| 9 | + * http://www.apache.org/licenses/LICENSE-2.0 |
| 10 | + * |
| 11 | + * Unless required by applicable law or agreed to in writing, software |
| 12 | + * distributed under the License is distributed on an "AS IS" BASIS, |
| 13 | + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| 14 | + * See the License for the specific language governing permissions and |
| 15 | + * limitations under the License. |
| 16 | + */ |
| 17 | + |
| 18 | +package org.apache.kyuubi.sql.zorder |
| 19 | + |
| 20 | +import org.apache.spark.sql.SparkSession |
| 21 | +import org.apache.spark.sql.catalyst.catalog.CatalogTable |
| 22 | +import org.apache.spark.sql.catalyst.expressions.{Ascending, Attribute, Expression, NullsLast, SortOrder} |
| 23 | +import org.apache.spark.sql.catalyst.plans.logical._ |
| 24 | +import org.apache.spark.sql.catalyst.rules.Rule |
| 25 | +import org.apache.spark.sql.execution.command.CreateDataSourceTableAsSelectCommand |
| 26 | +import org.apache.spark.sql.execution.datasources.InsertIntoHadoopFsRelationCommand |
| 27 | +import org.apache.spark.sql.hive.execution.{CreateHiveTableAsSelectCommand, InsertIntoHiveTable, OptimizedCreateHiveTableAsSelectCommand} |
| 28 | + |
| 29 | +import org.apache.kyuubi.sql.{KyuubiSQLConf, KyuubiSQLExtensionException} |
| 30 | + |
/**
 * Shared logic for the rules that put a z-order `Sort` underneath a table write.
 *
 * A table opts in through its properties: `kyuubi.zorder.enabled` must be "true"
 * (case-insensitive) and `kyuubi.zorder.cols` must hold a comma separated list of
 * column names. Concrete rules provide the [[SparkSession]] and implement
 * [[applyInternal]]; [[apply]] only delegates to it when
 * `KyuubiSQLConf.INSERT_ZORDER_BEFORE_WRITING` is enabled.
 */
trait InsertZorderHelper33 extends Rule[LogicalPlan] with ZorderBuilder {
  private val KYUUBI_ZORDER_ENABLED = "kyuubi.zorder.enabled"
  private val KYUUBI_ZORDER_COLS = "kyuubi.zorder.cols"

  /**
   * Returns true when the table properties enable z-order and also define the
   * z-order column list.
   */
  def isZorderEnabled(props: Map[String, String]): Boolean = {
    props.get(KYUUBI_ZORDER_ENABLED).exists(value => "true".equalsIgnoreCase(value)) &&
      props.contains(KYUUBI_ZORDER_COLS)
  }

  /**
   * Splits the `kyuubi.zorder.cols` property on commas and trims every entry.
   *
   * The result may be empty: `String.split` drops trailing empty strings, so a value
   * such as "," produces no entries at all. Callers are expected to check
   * [[isZorderEnabled]] first; the assertion documents that precondition.
   */
  def getZorderColumns(props: Map[String, String]): Seq[String] = {
    val cols = props.get(KYUUBI_ZORDER_COLS)
    assert(cols.isDefined)
    cols.toSeq.flatMap(_.split(",").toSeq).map(_.trim)
  }

  /**
   * Returns false when the query, looking through any leading `Project` nodes, already
   * ends with a `Sort`, `RepartitionByExpression` or `Repartition`.
   */
  def canInsertZorder(query: LogicalPlan): Boolean = query match {
    case Project(_, child) => canInsertZorder(child)
    // TODO: actually, we can force zorder even if existed some shuffle
    case _: Sort => false
    case _: RepartitionByExpression => false
    case _: Repartition => false
    case _ => true
  }

  /**
   * Wraps `plan` in a sort on the table's z-order columns.
   *
   * The plan is returned unchanged (same instance) when [[canInsertZorder]] rejects it,
   * when the property names no column at all, or when at least one configured column
   * cannot be resolved against the plan output.
   *
   * @param catalogTable table whose properties carry the z-order column list
   * @param plan query that produces the rows to be written
   * @param dynamicPartitionColumns partition attributes whose values come from the query
   * @throws KyuubiSQLExtensionException if `ZORDER_GLOBAL_SORT_ENABLED` and
   *                                     `REBALANCE_BEFORE_ZORDER` are both enabled
   */
  def insertZorder(
      catalogTable: CatalogTable,
      plan: LogicalPlan,
      dynamicPartitionColumns: Seq[Attribute]): LogicalPlan = {
    if (!canInsertZorder(plan)) {
      plan
    } else {
      val cols = getZorderColumns(catalogTable.properties)
      if (cols.isEmpty) {
        // Without this guard `bound.size < cols.size` is `0 < 0`, and we would go on to
        // build a Zorder expression with no children.
        logWarning(s"table property ${KYUUBI_ZORDER_COLS} does not contain any column " +
          "name, skip inserting zorder.")
        plan
      } else {
        val resolver = session.sessionState.conf.resolver
        val output = plan.output
        val bound = cols.flatMap(col => output.find(attr => resolver(attr.name, col)))
        if (bound.size < cols.size) {
          logWarning(s"target table does not contain all zorder cols: ${cols.mkString(",")}, " +
            s"please check your table properties ${KYUUBI_ZORDER_COLS}.")
          plan
        } else {
          sortByZorder(plan, bound, dynamicPartitionColumns)
        }
      }
    }
  }

  /**
   * Builds the `Sort` (optionally on top of `RebalancePartitions`) for the already
   * resolved, non-empty z-order attributes in `bound`.
   */
  private def sortByZorder(
      plan: LogicalPlan,
      bound: Seq[Attribute],
      dynamicPartitionColumns: Seq[Attribute]): LogicalPlan = {
    val globalSort = conf.getConf(KyuubiSQLConf.ZORDER_GLOBAL_SORT_ENABLED)
    if (globalSort && conf.getConf(KyuubiSQLConf.REBALANCE_BEFORE_ZORDER)) {
      throw new KyuubiSQLExtensionException(s"${KyuubiSQLConf.ZORDER_GLOBAL_SORT_ENABLED.key} " +
        s"and ${KyuubiSQLConf.REBALANCE_BEFORE_ZORDER.key} can not be enabled together.")
    }
    if (globalSort && dynamicPartitionColumns.nonEmpty) {
      logWarning(s"Dynamic partition insertion with global sort may produce small files.")
    }

    // A single column, or the "original ordering" mode, sorts on the columns themselves;
    // otherwise the columns are combined into one Zorder expression.
    val zorderExpr: Seq[Expression] =
      if (bound.length == 1 ||
        conf.getConf(KyuubiSQLConf.ZORDER_USING_ORIGINAL_ORDERING_ENABLED)) {
        bound
      } else {
        buildZorder(bound) :: Nil
      }

    def sortOn(exprs: Seq[Expression], child: LogicalPlan, global: Boolean): LogicalPlan = {
      val order = exprs.map { expr =>
        SortOrder(expr, Ascending, NullsLast, Seq.empty)
      }
      Sort(order, global, child)
    }

    if (globalSort) {
      sortOn(zorderExpr, plan, global = true)
    } else if (conf.getConf(KyuubiSQLConf.REBALANCE_BEFORE_ZORDER)) {
      val rebalanceExpr: Seq[Expression] =
        if (dynamicPartitionColumns.isEmpty) {
          // static partition insert
          bound
        } else if (conf.getConf(KyuubiSQLConf.REBALANCE_ZORDER_COLUMNS_ENABLED)) {
          // improve data compression ratio
          dynamicPartitionColumns ++ bound
        } else {
          dynamicPartitionColumns
        }
      val rebalance =
        if (dynamicPartitionColumns.nonEmpty &&
          conf.getConf(KyuubiSQLConf.TWO_PHASE_REBALANCE_BEFORE_ZORDER)) {
          // improve compression ratio
          RebalancePartitions(
            rebalanceExpr,
            RebalancePartitions(dynamicPartitionColumns, plan))
        } else {
          RebalancePartitions(rebalanceExpr, plan)
        }
      // for dynamic partition insert, Spark always sort the partition columns,
      // so here we sort partition columns + zorder.
      sortOn(dynamicPartitionColumns ++ zorderExpr, rebalance, global = false)
    } else {
      sortOn(zorderExpr, plan, global = false)
    }
  }

  override def buildZorder(children: Seq[Expression]): ZorderBase = Zorder(children)

  /** Session used to obtain the name resolver for matching z-order columns. */
  def session: SparkSession

  /** Rule body supplied by concrete rules; only called when the feature is enabled. */
  def applyInternal(plan: LogicalPlan): LogicalPlan

  final override def apply(plan: LogicalPlan): LogicalPlan = {
    if (conf.getConf(KyuubiSQLConf.INSERT_ZORDER_BEFORE_WRITING)) {
      applyInternal(plan)
    } else {
      plan
    }
  }
}
| 140 | + |
/**
 * Inserts a z-order sort below data source writes: `InsertIntoHadoopFsRelationCommand`
 * and `CreateDataSourceTableAsSelectCommand`.
 *
 * A command is rewritten only if its query is resolved, the target table has no bucket
 * spec and the table properties enable z-order. Any other plan is returned as is.
 */
case class InsertZorderBeforeWritingDatasource33(session: SparkSession)
  extends InsertZorderHelper33 {

  override def applyInternal(plan: LogicalPlan): LogicalPlan = plan match {
    case write: InsertIntoHadoopFsRelationCommand
        if write.query.resolved && write.bucketSpec.isEmpty &&
          write.catalogTable.exists(table => isZorderEnabled(table.properties)) =>
      // Partition columns without a static value are filled from the query output.
      val staticSpec = write.staticPartitions
      val dynamicCols = write.partitionColumns.filter(attr => !staticSpec.contains(attr.name))
      val sortedQuery = insertZorder(write.catalogTable.get, write.query, dynamicCols)
      // insertZorder hands back the same instance when nothing was added.
      if (sortedQuery ne write.query) write.copy(query = sortedQuery) else write

    case create: CreateDataSourceTableAsSelectCommand
        if create.query.resolved &&
          create.table.bucketSpec.isEmpty && isZorderEnabled(create.table.properties) =>
      val target = create.table
      val dynamicCols =
        create.query.output.filter(attr => target.partitionColumnNames.contains(attr.name))
      val sortedQuery = insertZorder(target, create.query, dynamicCols)
      if (sortedQuery ne create.query) create.copy(query = sortedQuery) else create

    case other => other
  }
}
| 172 | + |
/**
 * Inserts a z-order sort below Hive writes: `InsertIntoHiveTable`,
 * `CreateHiveTableAsSelectCommand` and `OptimizedCreateHiveTableAsSelectCommand`.
 *
 * A command is rewritten only if its query is resolved, the target table has no bucket
 * spec and the table properties enable z-order. Any other plan is returned as is.
 */
case class InsertZorderBeforeWritingHive33(session: SparkSession)
  extends InsertZorderHelper33 {

  /** Query output attributes whose names are partition columns of `table`. */
  private def partitionOutput(query: LogicalPlan, table: CatalogTable): Seq[Attribute] =
    query.output.filter(attr => table.partitionColumnNames.contains(attr.name))

  override def applyInternal(plan: LogicalPlan): LogicalPlan = plan match {
    case write: InsertIntoHiveTable
        if write.query.resolved &&
          write.table.bucketSpec.isEmpty && isZorderEnabled(write.table.properties) =>
      // Partition keys without a value in the spec are looked up in the query output.
      val unsetKeys = write.partition.filter { case (_, value) => value.isEmpty }.keys
      val dynamicCols =
        unsetKeys.flatMap(key => write.query.output.find(_.name == key)).toSeq
      val sortedQuery = insertZorder(write.table, write.query, dynamicCols)
      // insertZorder hands back the same instance when nothing was added.
      if (sortedQuery ne write.query) write.copy(query = sortedQuery) else write

    case create: CreateHiveTableAsSelectCommand
        if create.query.resolved &&
          create.tableDesc.bucketSpec.isEmpty && isZorderEnabled(create.tableDesc.properties) =>
      val dynamicCols = partitionOutput(create.query, create.tableDesc)
      val sortedQuery = insertZorder(create.tableDesc, create.query, dynamicCols)
      if (sortedQuery ne create.query) create.copy(query = sortedQuery) else create

    case create: OptimizedCreateHiveTableAsSelectCommand
        if create.query.resolved &&
          create.tableDesc.bucketSpec.isEmpty && isZorderEnabled(create.tableDesc.properties) =>
      val dynamicCols = partitionOutput(create.query, create.tableDesc)
      val sortedQuery = insertZorder(create.tableDesc, create.query, dynamicCols)
      if (sortedQuery ne create.query) create.copy(query = sortedQuery) else create

    case other => other
  }
}
0 commit comments