Skip to content

Commit 74ff5cf

Browse files
jiaoqingbopan3793
authored and committed
[KYUUBI #2789] Kyuubi Spark TPC-H Connector - Add tiny scale
### _Why are the changes needed?_ fix #2789 ### _How was this patch tested?_ - [x] Add some test cases that check the changes thoroughly including negative and positive cases if possible - [ ] Add screenshots for manual tests if appropriate - [x] [Run test](https://kyuubi.apache.org/docs/latest/develop_tools/testing.html#running-tests) locally before make a pull request Closes #2791 from jiaoqingbo/kyuubi-2789. Closes #2789 5f05691 [jiaoqingbo] [KYUUBI #2789] Kyuubi Spark TPC-H Connector - Add tiny scale Authored-by: jiaoqingbo <1178404354@qq.com> Signed-off-by: Cheng Pan <chengpan@apache.org>
1 parent a462230 commit 74ff5cf

File tree

7 files changed

+137
-45
lines changed

7 files changed

+137
-45
lines changed

extensions/spark/kyuubi-spark-connector-tpch/src/main/scala/org/apache/kyuubi/spark/connector/tpch/TPCHBatchScan.scala

Lines changed: 5 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -32,12 +32,12 @@ import org.apache.spark.sql.connector.read._
3232
import org.apache.spark.sql.types._
3333
import org.apache.spark.unsafe.types.UTF8String
3434

35-
case class TPCHTableChuck(table: String, scale: Int, parallelism: Int, index: Int)
35+
case class TPCHTableChuck(table: String, scale: Double, parallelism: Int, index: Int)
3636
extends InputPartition
3737

3838
class TPCHBatchScan(
3939
@transient table: TpchTable[_],
40-
scale: Int,
40+
scale: Double,
4141
schema: StructType) extends ScanBuilder
4242
with SupportsReportStatistics with Batch with Serializable {
4343

@@ -58,7 +58,8 @@ class TPCHBatchScan(
5858
override def toBatch: Batch = this
5959

6060
override def description: String =
61-
s"Scan TPC-H sf$scale.${table.getTableName}, count: ${_numRows}, parallelism: $parallelism"
61+
s"Scan TPC-H ${TPCHSchemaUtils.dbName(scale)}.${table.getTableName}, " +
62+
s"count: ${_numRows}, parallelism: $parallelism"
6263

6364
override def readSchema: StructType = schema
6465

@@ -81,7 +82,7 @@ class TPCHBatchScan(
8182

8283
class TPCHPartitionReader(
8384
table: String,
84-
scale: Int,
85+
scale: Double,
8586
parallelism: Int,
8687
index: Int,
8788
schema: StructType) extends PartitionReader[InternalRow] {

extensions/spark/kyuubi-spark-connector-tpch/src/main/scala/org/apache/kyuubi/spark/connector/tpch/TPCHCatalog.scala

Lines changed: 4 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -21,7 +21,6 @@ import java.util
2121

2222
import scala.collection.JavaConverters._
2323

24-
import io.trino.tpch.TpchTable
2524
import org.apache.spark.sql.catalyst.analysis.{NoSuchNamespaceException, NoSuchTableException}
2625
import org.apache.spark.sql.connector.catalog.{Identifier, NamespaceChange, SupportsNamespaces, Table => SparkTable, TableCatalog, TableChange}
2726
import org.apache.spark.sql.connector.expressions.Transform
@@ -30,12 +29,9 @@ import org.apache.spark.sql.util.CaseInsensitiveStringMap
3029

3130
class TPCHCatalog extends TableCatalog with SupportsNamespaces {
3231

33-
val tables: Array[String] = TpchTable.getTables.asScala
34-
.map(_.getTableName).toArray
32+
val databases: Array[String] = TPCHSchemaUtils.DATABASES
3533

36-
val scales: Array[Int] = TPCHStatisticsUtils.SCALES
37-
38-
val databases: Array[String] = scales.map("sf" + _)
34+
val tables: Array[String] = TPCHSchemaUtils.BASE_TABLES.map(_.getTableName)
3935

4036
var options: CaseInsensitiveStringMap = _
4137

@@ -55,7 +51,8 @@ class TPCHCatalog extends TableCatalog with SupportsNamespaces {
5551

5652
override def loadTable(ident: Identifier): SparkTable = (ident.namespace, ident.name) match {
5753
case (Array(db), table) if (databases contains db) && tables.contains(table.toLowerCase) =>
58-
new TPCHTable(table.toLowerCase, scales(databases indexOf db), options)
54+
val scale = TPCHSchemaUtils.scale(db)
55+
new TPCHTable(table.toLowerCase, scale, options)
5956
case (_, _) => throw new NoSuchTableException(ident)
6057
}
6158

Original file line numberDiff line numberDiff line change
@@ -0,0 +1,60 @@
1+
/*
2+
* Licensed to the Apache Software Foundation (ASF) under one or more
3+
* contributor license agreements. See the NOTICE file distributed with
4+
* this work for additional information regarding copyright ownership.
5+
* The ASF licenses this file to You under the Apache License, Version 2.0
6+
* (the "License"); you may not use this file except in compliance with
7+
* the License. You may obtain a copy of the License at
8+
*
9+
* http://www.apache.org/licenses/LICENSE-2.0
10+
*
11+
* Unless required by applicable law or agreed to in writing, software
12+
* distributed under the License is distributed on an "AS IS" BASIS,
13+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14+
* See the License for the specific language governing permissions and
15+
* limitations under the License.
16+
*/
17+
18+
package org.apache.kyuubi.spark.connector.tpch
19+
20+
import java.text.DecimalFormat
21+
22+
import scala.collection.JavaConverters._
23+
24+
import io.trino.tpch.TpchTable
25+
26+
object TPCHSchemaUtils {

  /** Scale factor of the "tiny" dataset, spelled exactly as [[normalize]] renders it. */
  val TINY_SCALE = "0.01"

  // Supported TPC-H scale factors, kept as canonical strings so lookups compare
  // against the normalized rendering of a Double instead of using float equality.
  val SCALES: Array[String] =
    Array(
      "0",
      TINY_SCALE,
      "1",
      "10",
      "30",
      "100",
      "300",
      "1000",
      "3000",
      "10000",
      "30000",
      "100000")

  val TINY_DB_NAME = "tiny"

  // One database per scale: the tiny scale is exposed as "tiny", all others as "sf<scale>".
  // DATABASES(i) always corresponds to SCALES(i).
  val DATABASES: Array[String] = SCALES.map {
    case TINY_SCALE => TINY_DB_NAME
    case scale => s"sf$scale"
  }

  /**
   * Renders a Double scale factor as a canonical string with at most two fraction
   * digits, e.g. 1.0 -> "1", 0.01 -> "0.01".
   *
   * Uses ROOT-locale symbols so the decimal separator is always '.' regardless of
   * the JVM default locale; a bare `new DecimalFormat("#.##")` would render 0.01 as
   * "0,01" under e.g. a French default locale and break every lookup against SCALES.
   * A fresh DecimalFormat per call also sidesteps its lack of thread-safety.
   */
  def normalize(scale: Double): String = {
    import java.text.DecimalFormatSymbols
    import java.util.Locale
    new DecimalFormat("#.##", DecimalFormatSymbols.getInstance(Locale.ROOT)).format(scale)
  }

  /** Resolves a database name ("tiny", "sf1", ...) back to its numeric scale factor. */
  def scale(dbName: String): Double = {
    val i = DATABASES.indexOf(dbName)
    // Fail with a clear message instead of an ArrayIndexOutOfBoundsException on -1.
    require(i >= 0, s"Unknown TPC-H database: $dbName")
    SCALES(i).toDouble
  }

  /** Resolves a numeric scale factor to its database name. */
  def dbName(scale: Double): String = {
    val i = SCALES.indexOf(normalize(scale))
    // Fail with a clear message instead of an ArrayIndexOutOfBoundsException on -1.
    require(i >= 0, s"Unsupported TPC-H scale: $scale")
    DATABASES(i)
  }

  // All TPC-H base tables as provided by the Trino tpch generator.
  val BASE_TABLES: Array[TpchTable[_]] = TpchTable.getTables.asScala.toArray

}

extensions/spark/kyuubi-spark-connector-tpch/src/main/scala/org/apache/kyuubi/spark/connector/tpch/TPCHStatisticsUtils.scala

Lines changed: 24 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -20,37 +20,39 @@ package org.apache.kyuubi.spark.connector.tpch
2020
import io.trino.tpch.TpchTable
2121
import io.trino.tpch.TpchTable._
2222

23+
import org.apache.kyuubi.spark.connector.tpch.TPCHSchemaUtils.{normalize, SCALES}
24+
2325
// https://www.tpc.org/tpc_documents_current_versions/pdf/tpc-h_v3.0.0.pdf
2426
// Page 88 Table 3: Estimated Database Size
2527
object TPCHStatisticsUtils {
2628

27-
val SCALES: Array[Int] = Array(0, 1, 10, 30, 100, 300, 1000, 3000, 10000, 30000, 100000)
28-
29-
def numRows(table: TpchTable[_], scale: Int): Long = {
30-
require(SCALES.contains(scale), s"Unsupported scale $scale")
31-
(table, scale) match {
32-
case (_, 0) => 0L
33-
case (CUSTOMER, scale) => 150000L * scale
34-
case (ORDERS, scale) => 1500000L * scale
35-
case (LINE_ITEM, 1) => 6001215L
36-
case (LINE_ITEM, 10) => 59986052L
37-
case (LINE_ITEM, 30) => 179998372L
38-
case (LINE_ITEM, 100) => 600037902L
39-
case (LINE_ITEM, 300) => 1799989091L
40-
case (LINE_ITEM, 1000) => 5999989709L
41-
case (LINE_ITEM, 3000) => 18000048306L
42-
case (LINE_ITEM, 10000) => 59999994267L
43-
case (LINE_ITEM, 30000) => 179999978268L
44-
case (LINE_ITEM, 100000) => 599999969200L
45-
case (PART, scale) => 200000L * scale
46-
case (PART_SUPPLIER, scale) => 800000L * scale
47-
case (SUPPLIER, scale) => 10000L * scale
29+
/**
 * Estimated row count for `table` at the given scale factor.
 *
 * Figures follow the TPC-H v3.0.0 specification (page 88, table 3): most tables
 * grow linearly with the scale factor, LINEITEM uses the spec's per-scale counts,
 * and NATION/REGION are fixed-size.
 */
def numRows(table: TpchTable[_], scale: Double): Long = {
  // Canonical string form avoids floating-point equality on the scale factor.
  val sf = normalize(scale)
  require(SCALES.contains(sf), s"Unsupported scale $sf")
  // Tables whose cardinality is a fixed multiple of the scale factor.
  def linear(rowsPerSf: Long): Long = (rowsPerSf * sf.toDouble).toLong
  (table, sf) match {
    case (_, "0") => 0L
    case (CUSTOMER, _) => linear(150000L)
    case (ORDERS, _) => linear(1500000L)
    case (LINE_ITEM, "0.01") => 60175L
    case (LINE_ITEM, "1") => 6001215L
    case (LINE_ITEM, "10") => 59986052L
    case (LINE_ITEM, "30") => 179998372L
    case (LINE_ITEM, "100") => 600037902L
    case (LINE_ITEM, "300") => 1799989091L
    case (LINE_ITEM, "1000") => 5999989709L
    case (LINE_ITEM, "3000") => 18000048306L
    case (LINE_ITEM, "10000") => 59999994267L
    case (LINE_ITEM, "30000") => 179999978268L
    case (LINE_ITEM, "100000") => 599999969200L
    case (PART, _) => linear(200000L)
    case (PART_SUPPLIER, _) => linear(800000L)
    case (SUPPLIER, _) => linear(10000L)
    case (NATION, _) => 25L
    case (REGION, _) => 5L
  }
}
5254

53-
def sizeInBytes(table: TpchTable[_], scale: Int): Long =
55+
def sizeInBytes(table: TpchTable[_], scale: Double): Long =
5456
numRows(table, scale) * TABLE_AVG_ROW_BYTES(table)
5557

5658
private val TABLE_AVG_ROW_BYTES: Map[TpchTable[_], Long] = Map(

extensions/spark/kyuubi-spark-connector-tpch/src/main/scala/org/apache/kyuubi/spark/connector/tpch/TPCHTable.scala

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -30,15 +30,15 @@ import org.apache.spark.sql.connector.read.ScanBuilder
3030
import org.apache.spark.sql.types._
3131
import org.apache.spark.sql.util.CaseInsensitiveStringMap
3232

33-
class TPCHTable(tbl: String, scale: Int, options: CaseInsensitiveStringMap)
33+
class TPCHTable(tbl: String, scale: Double, options: CaseInsensitiveStringMap)
3434
extends SparkTable with SupportsRead {
3535

3636
// When true, use CHAR VARCHAR; otherwise use STRING
3737
val useAnsiStringType: Boolean = options.getBoolean("useAnsiStringType", false)
3838

3939
val tpchTable: TpchTable[_] = TpchTable.getTable(tbl)
4040

41-
override def name: String = s"sf$scale.$tbl"
41+
override def name: String = s"${TPCHSchemaUtils.dbName(scale)}.$tbl"
4242

4343
override def toString: String = s"TPCHTable($name)"
4444

extensions/spark/kyuubi-spark-connector-tpch/src/test/scala/org/apache/kyuubi/spark/connector/tpch/TPCHCatalogSuite.scala

Lines changed: 10 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -45,19 +45,19 @@ class TPCHCatalogSuite extends KyuubiFunSuite {
4545

4646
test("supports namespaces") {
4747
spark.sql("use tpch")
48-
assert(spark.sql(s"SHOW DATABASES").collect().length == 11)
48+
assert(spark.sql(s"SHOW DATABASES").collect().length == 12)
4949
assert(spark.sql(s"SHOW NAMESPACES IN tpch.sf1").collect().length == 0)
5050
}
5151

52-
test("tpch.sf1 count") {
53-
assert(spark.table("tpch.sf1.customer").count === 150000)
54-
assert(spark.table("tpch.sf1.orders").count === 1500000)
55-
assert(spark.table("tpch.sf1.lineitem").count === 6001215)
56-
assert(spark.table("tpch.sf1.part").count === 200000)
57-
assert(spark.table("tpch.sf1.partsupp").count === 800000)
58-
assert(spark.table("tpch.sf1.supplier").count === 10000)
59-
assert(spark.table("tpch.sf1.nation").count === 25)
60-
assert(spark.table("tpch.sf1.region").count === 5)
52+
test("tpch.tiny count") {
53+
assert(spark.table("tpch.tiny.customer").count === 1500)
54+
assert(spark.table("tpch.tiny.orders").count === 15000)
55+
assert(spark.table("tpch.tiny.lineitem").count === 60175)
56+
assert(spark.table("tpch.tiny.part").count === 2000)
57+
assert(spark.table("tpch.tiny.partsupp").count === 8000)
58+
assert(spark.table("tpch.tiny.supplier").count === 100)
59+
assert(spark.table("tpch.tiny.nation").count === 25)
60+
assert(spark.table("tpch.tiny.region").count === 5)
6161
}
6262

6363
test("tpch.sf1 stats") {
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,32 @@
1+
/*
2+
* Licensed to the Apache Software Foundation (ASF) under one or more
3+
* contributor license agreements. See the NOTICE file distributed with
4+
* this work for additional information regarding copyright ownership.
5+
* The ASF licenses this file to You under the Apache License, Version 2.0
6+
* (the "License"); you may not use this file except in compliance with
7+
* the License. You may obtain a copy of the License at
8+
*
9+
* http://www.apache.org/licenses/LICENSE-2.0
10+
*
11+
* Unless required by applicable law or agreed to in writing, software
12+
* distributed under the License is distributed on an "AS IS" BASIS,
13+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14+
* See the License for the specific language governing permissions and
15+
* limitations under the License.
16+
*/
17+
18+
package org.apache.kyuubi.spark.connector.tpch
19+
20+
import org.apache.kyuubi.KyuubiFunSuite
21+
import org.apache.kyuubi.spark.connector.tpch.TPCHSchemaUtils.normalize
22+
23+
class TPCHSchemaUtilsSuite extends KyuubiFunSuite {

  test("normalize scale") {
    // Each pair is (raw double scale, expected canonical rendering); values within
    // double rounding error of a supported scale must normalize to that scale.
    val expectations = Seq[(Double, String)](
      1 -> "1",
      0.010000000000000000001 -> "0.01",
      1.000000000000000000001 -> "1",
      0.999999999999999999999 -> "1",
      9.999999999999999999999 -> "10")
    expectations.foreach { case (raw, expected) =>
      assert(normalize(raw) === expected)
    }
  }
}

0 commit comments

Comments
 (0)