Skip to content

Commit c9cc9b7

Browse files
committed
[KYUUBI #3209] Support configure TPC-H connector in runtime
### _Why are the changes needed?_ This PR proposes to introduce `TPCHConf` to support configuring the TPC-H connector at runtime, just like the TPC-DS connector. ### _How was this patch tested?_ - [ ] Add some test cases that check the changes thoroughly, including negative and positive cases if possible - [ ] Add screenshots for manual tests if appropriate - [x] [Run test](https://kyuubi.apache.org/docs/latest/develop_tools/testing.html#running-tests) locally before making a pull request Closes #3209 from pan3793/tpch-conf. Closes #3209 af45166 [Cheng Pan] nit bbc1cba [Cheng Pan] Support configure TPC-H connector in runtime Authored-by: Cheng Pan <chengpan@apache.org> Signed-off-by: Cheng Pan <chengpan@apache.org>
1 parent 875fedd commit c9cc9b7

File tree

6 files changed

+97
-31
lines changed

6 files changed

+97
-31
lines changed

extensions/spark/kyuubi-spark-connector-tpch/src/main/scala/org/apache/kyuubi/spark/connector/tpch/TPCHBatchScan.scala

Lines changed: 3 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -42,20 +42,19 @@ case class TPCHTableChuck(table: String, scale: Double, parallelism: Int, index:
4242
class TPCHBatchScan(
4343
@transient table: TpchTable[_],
4444
scale: Double,
45-
schema: StructType) extends ScanBuilder
45+
schema: StructType,
46+
readConf: TPCHReadConf) extends ScanBuilder
4647
with SupportsReportStatistics with Batch with Serializable {
4748

4849
private val _sizeInBytes: Long = TPCHStatisticsUtils.sizeInBytes(table, scale)
4950

5051
private val _numRows: Long = TPCHStatisticsUtils.numRows(table, scale)
5152

52-
private val rowCountPerTask: Int = 1000000
53-
5453
private val parallelism: Int =
5554
if (table.equals(TpchTable.NATION) || table.equals(TpchTable.REGION)) 1
5655
else math.max(
5756
SparkSession.active.sparkContext.defaultParallelism,
58-
(_numRows / rowCountPerTask.toDouble).ceil.toInt)
57+
(_numRows / readConf.maxPartitionBytes).ceil.toInt)
5958

6059
override def build: Scan = this
6160

extensions/spark/kyuubi-spark-connector-tpch/src/main/scala/org/apache/kyuubi/spark/connector/tpch/TPCHCatalog.scala

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,7 @@ import java.util
2222
import scala.collection.JavaConverters._
2323

2424
import org.apache.spark.internal.Logging
25+
import org.apache.spark.sql.SparkSession
2526
import org.apache.spark.sql.catalyst.analysis.{NoSuchNamespaceException, NoSuchTableException}
2627
import org.apache.spark.sql.connector.catalog.{Identifier, NamespaceChange, SupportsNamespaces, Table => SparkTable, TableCatalog, TableChange}
2728
import org.apache.spark.sql.connector.expressions.Transform
@@ -34,17 +35,16 @@ class TPCHCatalog extends TableCatalog with SupportsNamespaces with Logging {
3435

3536
val tables: Array[String] = TPCHSchemaUtils.BASE_TABLES.map(_.getTableName)
3637

37-
var options: CaseInsensitiveStringMap = _
38+
var tpchConf: TPCHConf = _
3839

3940
var _name: String = _
4041

4142
override def name: String = _name
4243

4344
override def initialize(name: String, options: CaseInsensitiveStringMap): Unit = {
4445
this._name = name
45-
this.options = options
46-
val uncheckedExcludeDatabases = options.getOrDefault("excludeDatabases", "")
47-
.split(",").map(_.toLowerCase.trim).filter(_.nonEmpty)
46+
this.tpchConf = TPCHConf(SparkSession.active, options)
47+
val uncheckedExcludeDatabases = tpchConf.excludeDatabases
4848
val invalidExcludeDatabases = uncheckedExcludeDatabases diff TPCHSchemaUtils.DATABASES
4949
if (invalidExcludeDatabases.nonEmpty) {
5050
logWarning(
@@ -64,7 +64,7 @@ class TPCHCatalog extends TableCatalog with SupportsNamespaces with Logging {
6464
override def loadTable(ident: Identifier): SparkTable = (ident.namespace, ident.name) match {
6565
case (Array(db), table) if (databases contains db) && tables.contains(table.toLowerCase) =>
6666
val scale = TPCHSchemaUtils.scale(db)
67-
new TPCHTable(table.toLowerCase, scale, options)
67+
new TPCHTable(table.toLowerCase, scale, tpchConf)
6868
case (_, _) => throw new NoSuchTableException(ident)
6969
}
7070

Original file line numberDiff line numberDiff line change
@@ -0,0 +1,71 @@
1+
/*
2+
* Licensed to the Apache Software Foundation (ASF) under one or more
3+
* contributor license agreements. See the NOTICE file distributed with
4+
* this work for additional information regarding copyright ownership.
5+
* The ASF licenses this file to You under the Apache License, Version 2.0
6+
* (the "License"); you may not use this file except in compliance with
7+
* the License. You may obtain a copy of the License at
8+
*
9+
* http://www.apache.org/licenses/LICENSE-2.0
10+
*
11+
* Unless required by applicable law or agreed to in writing, software
12+
* distributed under the License is distributed on an "AS IS" BASIS,
13+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14+
* See the License for the specific language governing permissions and
15+
* limitations under the License.
16+
*/
17+
18+
package org.apache.kyuubi.spark.connector.tpch
19+
20+
import org.apache.spark.sql.SparkSession
21+
import org.apache.spark.sql.connector.catalog.Table
22+
import org.apache.spark.sql.util.CaseInsensitiveStringMap
23+
24+
import org.apache.kyuubi.spark.connector.common.SparkConfParser
25+
import org.apache.kyuubi.spark.connector.tpch.TPCHConf._
26+
27+
/**
 * Catalog-level configuration for the TPC-H connector, resolved from the
 * catalog options and the active Spark session conf via [[SparkConfParser]].
 */
case class TPCHConf(spark: SparkSession, options: CaseInsensitiveStringMap) {

  // No table properties at catalog scope, hence the null third argument.
  private val parser: SparkConfParser = SparkConfParser(options, spark.conf, null)

  /**
   * Databases to hide from this catalog. Parsed from the comma-separated
   * `excludeDatabases` option; entries are trimmed, lower-cased, and blanks
   * are dropped. Empty when the option is absent.
   */
  lazy val excludeDatabases: Array[String] =
    parser.stringConf()
      .option(EXCLUDE_DATABASES)
      .parseOptional()
      .map { raw =>
        raw.split(",").map(_.toLowerCase.trim).filter(_.nonEmpty)
      }
      .getOrElse(Array.empty)

  // When true, use CHAR/VARCHAR column types; otherwise use STRING.
  lazy val useAnsiStringType: Boolean =
    parser.booleanConf()
      .option(USE_ANSI_STRING_TYPE)
      .sessionConf(s"$TPCH_CONNECTOR_CONF_PREFIX.$USE_ANSI_STRING_TYPE")
      .defaultValue(USE_ANSI_STRING_TYPE_DEFAULT)
      .parse()
}
44+
45+
/**
 * Read-path configuration for a single TPC-H scan, resolved from the scan
 * options, the active Spark session conf, and the table's properties via
 * [[SparkConfParser]].
 */
case class TPCHReadConf(
    spark: SparkSession,
    table: Table,
    options: CaseInsensitiveStringMap) {

  private val parser: SparkConfParser =
    SparkConfParser(options, spark.conf, table.properties)

  // Size cap used when planning read partitions; looked up as a scan option,
  // a session conf, or a table property, with a built-in default.
  lazy val maxPartitionBytes: Long = {
    val qualifiedKey = s"$TPCH_CONNECTOR_READ_CONF_PREFIX.$MAX_PARTITION_BYTES_CONF"
    parser.longConf()
      .option(MAX_PARTITION_BYTES_CONF)
      .sessionConf(qualifiedKey)
      .tableProperty(qualifiedKey)
      .defaultValue(MAX_PARTITION_BYTES_DEFAULT)
      .parse()
  }
}
60+
61+
/** Configuration keys and defaults shared by [[TPCHConf]] and [[TPCHReadConf]]. */
object TPCHConf {

  /** Catalog option: comma-separated list of databases to exclude. */
  val EXCLUDE_DATABASES = "excludeDatabases"

  /** Session-conf prefix for connector-wide settings. */
  val TPCH_CONNECTOR_CONF_PREFIX = "spark.connector.tpch"

  /** When enabled, VARCHAR columns use CHAR/VARCHAR types instead of STRING. */
  val USE_ANSI_STRING_TYPE = "useAnsiStringType"
  val USE_ANSI_STRING_TYPE_DEFAULT = false

  /** Session-conf / table-property prefix for read-path settings. */
  val TPCH_CONNECTOR_READ_CONF_PREFIX = s"$TPCH_CONNECTOR_CONF_PREFIX.read"

  /** Key for the per-partition size cap. */
  val MAX_PARTITION_BYTES_CONF = "maxPartitionBytes"

  /** Default partition size cap: 128 MiB. */
  val MAX_PARTITION_BYTES_DEFAULT: Long = 128L * 1024 * 1024
}

extensions/spark/kyuubi-spark-connector-tpch/src/main/scala/org/apache/kyuubi/spark/connector/tpch/TPCHStatisticsUtils.scala

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -22,7 +22,7 @@ import io.trino.tpch.TpchTable._
2222

2323
import org.apache.kyuubi.spark.connector.tpch.TPCHSchemaUtils.{normalize, SCALES}
2424

25-
// https://www.tpc.org/tpc_documents_current_versions/pdf/tpc-h_v3.0.0.pdf
25+
// https://www.tpc.org/tpc_documents_current_versions/pdf/tpc-h_v3.0.1.pdf
2626
// Page 88 Table 3: Estimated Database Size
2727
object TPCHStatisticsUtils {
2828

extensions/spark/kyuubi-spark-connector-tpch/src/main/scala/org/apache/kyuubi/spark/connector/tpch/TPCHTable.scala

Lines changed: 4 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -25,17 +25,15 @@ import scala.collection.convert.ImplicitConversions.`list asScalaBuffer`
2525

2626
import io.trino.tpch.{TpchColumnType, TpchEntity, TpchTable}
2727
import io.trino.tpch.TpchColumnType.Base._
28+
import org.apache.spark.sql.SparkSession
2829
import org.apache.spark.sql.connector.catalog.{SupportsRead, Table => SparkTable, TableCapability}
2930
import org.apache.spark.sql.connector.read.ScanBuilder
3031
import org.apache.spark.sql.types._
3132
import org.apache.spark.sql.util.CaseInsensitiveStringMap
3233

33-
class TPCHTable(tbl: String, scale: Double, options: CaseInsensitiveStringMap)
34+
class TPCHTable(tbl: String, scale: Double, tpchConf: TPCHConf)
3435
extends SparkTable with SupportsRead {
3536

36-
// When true, use CHAR VARCHAR; otherwise use STRING
37-
val useAnsiStringType: Boolean = options.getBoolean("useAnsiStringType", false)
38-
3937
val tpchTable: TpchTable[_] = TpchTable.getTable(tbl)
4038

4139
override def name: String = s"${TPCHSchemaUtils.dbName(scale)}.$tbl"
@@ -53,7 +51,7 @@ class TPCHTable(tbl: String, scale: Double, options: CaseInsensitiveStringMap)
5351
Set(TableCapability.BATCH_READ).asJava
5452

5553
override def newScanBuilder(options: CaseInsensitiveStringMap): ScanBuilder = {
56-
new TPCHBatchScan(tpchTable, scale, schema)
54+
new TPCHBatchScan(tpchTable, scale, schema, TPCHReadConf(SparkSession.active, this, options))
5755
}
5856

5957
def toSparkDataType(tpchType: TpchColumnType): DataType = {
@@ -63,7 +61,7 @@ class TPCHTable(tbl: String, scale: Double, options: CaseInsensitiveStringMap)
6361
case (DOUBLE, None, None) => DoubleType
6462
case (DATE, None, None) => DateType
6563
case (VARCHAR, Some(precision), None) =>
66-
if (useAnsiStringType) VarcharType(precision.toInt) else StringType
64+
if (tpchConf.useAnsiStringType) VarcharType(precision.toInt) else StringType
6765
case (t, po, so) =>
6866
throw new IllegalArgumentException(s"Unsupported TPC-H type: ($t, $po, $so)")
6967
}

extensions/spark/kyuubi-spark-connector-tpch/src/test/scala/org/apache/kyuubi/spark/connector/tpch/TPCHCatalogSuite.scala

Lines changed: 13 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -27,22 +27,20 @@ import org.apache.kyuubi.spark.connector.common.SparkUtils
2727

2828
class TPCHCatalogSuite extends KyuubiFunSuite {
2929

30-
protected lazy val spark: SparkSession = {
31-
SparkSession.builder()
32-
.master("local[*]")
33-
.config("spark.ui.enabled", "false")
34-
.config("spark.sql.catalogImplementation", "in-memory")
35-
.config("spark.sql.catalog.tpch", classOf[TPCHCatalog].getName)
36-
.config("spark.sql.cbo.enabled", "true")
37-
.config("spark.sql.cbo.planStats.enabled", "true")
38-
.getOrCreate()
39-
}
40-
4130
test("get catalog name") {
42-
val catalog = new TPCHCatalog
43-
val catalogName = "test"
44-
catalog.initialize(catalogName, CaseInsensitiveStringMap.empty())
45-
assert(catalog._name == catalogName)
31+
val sparkConf = new SparkConf()
32+
.setMaster("local[*]")
33+
.set("spark.ui.enabled", "false")
34+
.set("spark.sql.catalogImplementation", "in-memory")
35+
.set("spark.sql.catalog.tpch", classOf[TPCHCatalog].getName)
36+
.set("spark.sql.cbo.enabled", "true")
37+
.set("spark.sql.cbo.planStats.enabled", "true")
38+
withSparkSession(SparkSession.builder.config(sparkConf).getOrCreate()) { spark =>
39+
val catalog = new TPCHCatalog
40+
val catalogName = "test"
41+
catalog.initialize(catalogName, CaseInsensitiveStringMap.empty())
42+
assert(catalog._name == catalogName)
43+
}
4644
}
4745

4846
test("supports namespaces") {

0 commit comments

Comments
 (0)