From b4a877c628664ad0c1a9ca3c3db5ec983ce10775 Mon Sep 17 00:00:00 2001 From: Hongze Zhang Date: Mon, 16 Mar 2026 20:53:54 +0100 Subject: [PATCH] [VL] Gluten-it: Fix broken `--decimal-as-double` --- .../apache/gluten/integration/Constants.scala | 10 +++++++++ .../apache/gluten/integration/DataGen.scala | 2 ++ .../org/apache/gluten/integration/Suite.scala | 2 +- .../clickbench/ClickBenchSuite.scala | 18 ++++++---------- .../gluten/integration/ds/TpcdsSuite.scala | 21 +++++++------------ .../gluten/integration/h/TpchSuite.scala | 19 +++++++---------- .../table/LayoutTableCreator.scala | 13 +++++++++--- .../integration/table/TableCreator.scala | 6 ++++-- 8 files changed, 48 insertions(+), 43 deletions(-) diff --git a/tools/gluten-it/common/src/main/scala/org/apache/gluten/integration/Constants.scala b/tools/gluten-it/common/src/main/scala/org/apache/gluten/integration/Constants.scala index 19867b6476b0..9252334be96b 100644 --- a/tools/gluten-it/common/src/main/scala/org/apache/gluten/integration/Constants.scala +++ b/tools/gluten-it/common/src/main/scala/org/apache/gluten/integration/Constants.scala @@ -131,6 +131,8 @@ object Constants { @deprecated val TYPE_MODIFIER_DATE_AS_DOUBLE: TypeModifier = new TypeModifier(TypeUtils.typeAccepts(_, DateType), DoubleType) { + override def name(): String = "data_as_double" + override def modValue(from: Any): Any = { from match { case v: Date => v.getTime.asInstanceOf[Double] / 86400.0d / 1000.0d @@ -141,6 +143,8 @@ object Constants { @deprecated val TYPE_MODIFIER_INTEGER_AS_DOUBLE: TypeModifier = new TypeModifier(TypeUtils.typeAccepts(_, IntegerType), DoubleType) { + override def name(): String = "integer_as_double" + override def modValue(from: Any): Any = { from match { case v: Int => v.asInstanceOf[Double] @@ -151,6 +155,8 @@ object Constants { @deprecated val TYPE_MODIFIER_LONG_AS_DOUBLE: TypeModifier = new TypeModifier(TypeUtils.typeAccepts(_, LongType), DoubleType) { + override def name(): String = "long_as_double" + override def modValue(from: Any): Any = { from match { case v: Long => v.asInstanceOf[Double] @@ -161,6 +167,8 @@ object Constants { @deprecated val TYPE_MODIFIER_DATE_AS_STRING: TypeModifier = new TypeModifier(TypeUtils.typeAccepts(_, DateType), StringType) { + override def name(): String = "date_as_string" + override def modValue(from: Any): Any = { from match { case v: Date => v.toString @@ -170,6 +178,8 @@ object Constants { val TYPE_MODIFIER_DECIMAL_AS_DOUBLE: TypeModifier = new TypeModifier(TypeUtils.decimalAccepts, DoubleType) { + override def name(): String = "decimal_as_double" + override def modValue(from: Any): Any = { from match { case v: java.math.BigDecimal => v.doubleValue() diff --git a/tools/gluten-it/common/src/main/scala/org/apache/gluten/integration/DataGen.scala b/tools/gluten-it/common/src/main/scala/org/apache/gluten/integration/DataGen.scala index b7bdc122b041..e1ef410f55cf 100644 --- a/tools/gluten-it/common/src/main/scala/org/apache/gluten/integration/DataGen.scala +++ b/tools/gluten-it/common/src/main/scala/org/apache/gluten/integration/DataGen.scala @@ -27,10 +27,12 @@ trait DataGen { abstract class TypeModifier(val predicate: DataType => Boolean, val to: DataType) extends Serializable { + def name(): String def modValue(value: Any): Any } class NoopModifier(t: DataType) extends TypeModifier(_ => true, t) { + override def name(): String = "noop" override def modValue(value: Any): Any = value } diff --git a/tools/gluten-it/common/src/main/scala/org/apache/gluten/integration/Suite.scala b/tools/gluten-it/common/src/main/scala/org/apache/gluten/integration/Suite.scala index eca23473b3eb..16aa00ae4f90 100644 --- a/tools/gluten-it/common/src/main/scala/org/apache/gluten/integration/Suite.scala +++ b/tools/gluten-it/common/src/main/scala/org/apache/gluten/integration/Suite.scala @@ -264,7 +264,7 @@ abstract class Suite( testMetricMapper } - private[integration] def typeModifiers(): List[TypeModifier] = { + private[integration] def typeModifiers(): Seq[TypeModifier] = { if (decimalAsDouble) List(TYPE_MODIFIER_DECIMAL_AS_DOUBLE) else List() } diff --git a/tools/gluten-it/common/src/main/scala/org/apache/gluten/integration/clickbench/ClickBenchSuite.scala b/tools/gluten-it/common/src/main/scala/org/apache/gluten/integration/clickbench/ClickBenchSuite.scala index a93c7e3746ac..385406a90779 100644 --- a/tools/gluten-it/common/src/main/scala/org/apache/gluten/integration/clickbench/ClickBenchSuite.scala +++ b/tools/gluten-it/common/src/main/scala/org/apache/gluten/integration/clickbench/ClickBenchSuite.scala @@ -84,6 +84,12 @@ class ClickBenchSuite( ) { import ClickBenchSuite._ + require( + Set("parquet").contains(dataSource), + s"Data source type $dataSource is not supported by ClickBench suite") + require(dataScale == 1.0d, "ClickBench suite doesn't support scale factor other than 1") + require(!genPartitionedData, "ClickBench suite doesn't support generating partitioned data") + override protected def historyWritePath(): String = HISTORY_WRITE_PATH override private[integration] def dataWritePath(): String = { @@ -91,7 +97,6 @@ class ClickBenchSuite( } override private[integration] def createDataGen(): DataGen = { - checkDataGenArgs(dataSource, dataScale, genPartitionedData) new ClickBenchDataGen(dataWritePath()) } @@ -112,15 +117,4 @@ private object ClickBenchSuite { private val DATA_WRITE_RELATIVE_PATH = "clickbench-generated" private val HISTORY_WRITE_PATH = "/tmp/clickbench-history" private val ALL_QUERY_IDS = (1 to 43).map(i => s"q$i").toArray - - private def checkDataGenArgs( - dataSource: String, - scale: Double, - genPartitionedData: Boolean): Unit = { - require( - Set("parquet").contains(dataSource), - s"Data source type $dataSource is not supported by ClickBench suite") - require(scale == 1.0d, "ClickBench suite doesn't support scale factor other than 1") - require(!genPartitionedData, "ClickBench suite doesn't support generating partitioned data") - } } diff --git a/tools/gluten-it/common/src/main/scala/org/apache/gluten/integration/ds/TpcdsSuite.scala b/tools/gluten-it/common/src/main/scala/org/apache/gluten/integration/ds/TpcdsSuite.scala index b99104ded50d..c6a0b26e8590 100644 --- a/tools/gluten-it/common/src/main/scala/org/apache/gluten/integration/ds/TpcdsSuite.scala +++ b/tools/gluten-it/common/src/main/scala/org/apache/gluten/integration/ds/TpcdsSuite.scala @@ -78,7 +78,9 @@ class TpcdsSuite( ) { import TpcdsSuite._ - checkDataGenArgs(dataSource, dataScale, genPartitionedData) + require( + Set("parquet", "delta").contains(dataSource), + s"Data source type $dataSource is not supported by TPC-DS suite") private val tableLayout = new TpcdsTableLayout(genPartitionedData) @@ -90,9 +92,10 @@ class TpcdsSuite( } else { "non_partitioned" } - val featureFlags = dataGenFeatures.map(feature => s"-$feature").mkString("") + val typeModifierFlags = typeModifiers().map(m => s"-${m.name()}").sorted.mkString("-") + val featureFlags = dataGenFeatures.map(feature => s"-$feature").sorted.mkString("") val relative = - s"$TPCDS_WRITE_RELATIVE_PATH-$dataScale-$dataSource-$partitionedFlag$featureFlags" + s"$TPCDS_WRITE_RELATIVE_PATH-$dataScale-$dataSource-$partitionedFlag$typeModifierFlags$featureFlags" new Path(dataDir, relative).toString } @@ -113,7 +116,8 @@ class TpcdsSuite( override private[integration] def desc(): String = "TPC-DS" - override def tableCreator(): TableCreator = TableCreator.createFromLayout(tableLayout) + override def tableCreator(): TableCreator = + TableCreator.createFromLayout(tableLayout, typeModifiers()) override def tableAnalyzer0(): TableAnalyzer = TableAnalyzer.analyzeAll() } @@ -226,13 +230,4 @@ object TpcdsSuite { "q99" ) private val HISTORY_WRITE_PATH = "/tmp/tpcds-history" - - private def checkDataGenArgs( - dataSource: String, - scale: Double, - genPartitionedData: Boolean): Unit = { - require( - Set("parquet", "delta").contains(dataSource), - s"Data source type $dataSource is not supported by TPC-DS suite") - } } diff --git a/tools/gluten-it/common/src/main/scala/org/apache/gluten/integration/h/TpchSuite.scala b/tools/gluten-it/common/src/main/scala/org/apache/gluten/integration/h/TpchSuite.scala index 059aff4ab7c3..28d234101ce3 100644 --- a/tools/gluten-it/common/src/main/scala/org/apache/gluten/integration/h/TpchSuite.scala +++ b/tools/gluten-it/common/src/main/scala/org/apache/gluten/integration/h/TpchSuite.scala @@ -82,17 +82,22 @@ class TpchSuite( ) { import TpchSuite._ + require( + Set("parquet", "delta").contains(dataSource), + s"Data source type $dataSource is not supported by TPC-H suite") + require(!genPartitionedData, "TPC-H suite doesn't support generating partitioned data") + override protected def historyWritePath(): String = HISTORY_WRITE_PATH override private[integration] def dataWritePath(): String = { + val typeModifierFlags = typeModifiers().map(m => s"-${m.name()}").mkString("-") val featureFlags = dataGenFeatures.map(feature => s"-$feature").mkString("") val relative = - s"$TPCH_WRITE_RELATIVE_PATH-$dataScale-$dataSource$featureFlags" + s"$TPCH_WRITE_RELATIVE_PATH-$dataScale-$dataSource$typeModifierFlags$featureFlags" new Path(dataDir, relative).toString } override private[integration] def createDataGen(): DataGen = { - checkDataGenArgs(dataSource, dataScale, genPartitionedData) new TpchDataGen( dataScale, shufflePartitions, @@ -139,14 +144,4 @@ object TpchSuite { "q21", "q22") private val HISTORY_WRITE_PATH = "/tmp/tpch-history" - - private def checkDataGenArgs( - dataSource: String, - scale: Double, - genPartitionedData: Boolean): Unit = { - require( - Set("parquet", "delta").contains(dataSource), - s"Data source type $dataSource is not supported by TPC-H suite") - require(!genPartitionedData, "TPC-H suite doesn't support generating partitioned data") - } } diff --git a/tools/gluten-it/common/src/main/scala/org/apache/gluten/integration/table/LayoutTableCreator.scala b/tools/gluten-it/common/src/main/scala/org/apache/gluten/integration/table/LayoutTableCreator.scala index f153c0222bfd..aa72f59acde2 100644 --- a/tools/gluten-it/common/src/main/scala/org/apache/gluten/integration/table/LayoutTableCreator.scala +++ b/tools/gluten-it/common/src/main/scala/org/apache/gluten/integration/table/LayoutTableCreator.scala @@ -16,6 +16,8 @@ */ package org.apache.gluten.integration.table +import org.apache.gluten.integration.{DataGen, TypeModifier} + import org.apache.spark.sql.{AnalysisException, Row, SparkSession} import org.apache.hadoop.fs.{FileSystem, Path} @@ -23,7 +25,9 @@ import org.apache.hadoop.fs.{FileSystem, Path} import java.net.URI import scala.collection.mutable.ArrayBuffer -class LayoutTableCreator(layout: TableLayout) extends TableCreator { + +class LayoutTableCreator(layout: TableLayout, typeModifiers: Seq[TypeModifier]) + extends TableCreator { override def create(spark: SparkSession, source: String, dataPath: String): Unit = { val uri = URI.create(dataPath) val fs = FileSystem.get(uri, spark.sessionState.newHadoopConf()) @@ -51,14 +55,17 @@ class LayoutTableCreator(layout: TableLayout) extends TableCreator { val schema = layout.getSchema(tableName) val partitionColumns = layout.getPartitionColumns(tableName) - val nonPartitionColumns = schema.fields + val rowModifier = DataGen.getRowModifier(schema, typeModifiers) + val modifiedSchema = DataGen.modifySchema(schema, rowModifier) + + val nonPartitionColumns = modifiedSchema.fields .filterNot(f => partitionColumns.contains(f.name)) .map(f => s"${f.name} ${f.dataType.sql}") .mkString(", ") val partitionDDL = if (partitionColumns.nonEmpty) - s"PARTITIONED BY (${partitionColumns.map(c => s"$c ${schema(c).dataType.sql}").mkString(", ")})" + s"PARTITIONED BY (${partitionColumns.map(c => s"$c ${modifiedSchema(c).dataType.sql}").mkString(", ")})" else "" diff --git a/tools/gluten-it/common/src/main/scala/org/apache/gluten/integration/table/TableCreator.scala b/tools/gluten-it/common/src/main/scala/org/apache/gluten/integration/table/TableCreator.scala index 1af92ae36899..fbad48dd87b4 100644 --- a/tools/gluten-it/common/src/main/scala/org/apache/gluten/integration/table/TableCreator.scala +++ b/tools/gluten-it/common/src/main/scala/org/apache/gluten/integration/table/TableCreator.scala @@ -16,6 +16,8 @@ */ package org.apache.gluten.integration.table +import org.apache.gluten.integration.TypeModifier + import org.apache.spark.sql.SparkSession trait TableCreator { @@ -27,7 +29,7 @@ object TableCreator { AutoTableCreator } - def createFromLayout(layout: TableLayout): TableCreator = { - new LayoutTableCreator(layout) + def createFromLayout(layout: TableLayout, typeModifiers: Seq[TypeModifier]): TableCreator = { + new LayoutTableCreator(layout, typeModifiers) } }