From a6964b3606e83ee209961d4ee4ecbec93cc1c9ac Mon Sep 17 00:00:00 2001 From: Chengcheng Jin Date: Tue, 14 Oct 2025 15:32:51 -0400 Subject: [PATCH 1/7] [GLUTEN-8851] Validate the plan before execution --- .../apache/gluten/config/VeloxConfig.scala | 9 ++++++++ .../extension/CudfNodeValidationRule.scala | 21 ++++++++++++++++--- 2 files changed, 27 insertions(+), 3 deletions(-) diff --git a/backends-velox/src/main/scala/org/apache/gluten/config/VeloxConfig.scala b/backends-velox/src/main/scala/org/apache/gluten/config/VeloxConfig.scala index 093186f569a1..73059f841981 100644 --- a/backends-velox/src/main/scala/org/apache/gluten/config/VeloxConfig.scala +++ b/backends-velox/src/main/scala/org/apache/gluten/config/VeloxConfig.scala @@ -78,6 +78,8 @@ class VeloxConfig(conf: SQLConf) extends GlutenConfig(conf) { def cudfEnableTableScan: Boolean = getConf(CUDF_ENABLE_TABLE_SCAN) + def cudfEnableValidation: Boolean = getConf(CUDF_ENABLE_VALIDATION) + def orcUseColumnNames: Boolean = getConf(ORC_USE_COLUMN_NAMES) def parquetUseColumnNames: Boolean = getConf(PARQUET_USE_COLUMN_NAMES) @@ -624,6 +626,13 @@ object VeloxConfig extends ConfigRegistry { .booleanConf .createWithDefault(false) + val CUDF_ENABLE_VALIDATION = + buildStaticConf("spark.gluten.sql.columnar.backend.velox.cudf.enableValidation") + .doc("Heuristics you can apply to validate a cuDF/GPU plan and only offload when " + + "the entire stage can be fully and profitably executed on GPU") + .booleanConf + .createWithDefault(true) + val MEMORY_DUMP_ON_EXIT = buildConf("spark.gluten.monitor.memoryDumpOnExit") .internal() diff --git a/backends-velox/src/main/scala/org/apache/gluten/extension/CudfNodeValidationRule.scala b/backends-velox/src/main/scala/org/apache/gluten/extension/CudfNodeValidationRule.scala index 20e819e21582..b8bda269d6ab 100644 --- a/backends-velox/src/main/scala/org/apache/gluten/extension/CudfNodeValidationRule.scala +++ b/backends-velox/src/main/scala/org/apache/gluten/extension/CudfNodeValidationRule.scala @@ -17,8 +17,8 @@ package org.apache.gluten.extension import org.apache.gluten.config.{GlutenConfig, VeloxConfig} -import org.apache.gluten.execution.{CudfTag, LeafTransformSupport, WholeStageTransformer} - +import org.apache.gluten.cudf.VeloxCudfPlanValidatorJniWrapper +import org.apache.gluten.execution.{CudfTag, LeafTransformSupport, TransformSupport, WholeStageTransformer} import org.apache.spark.sql.catalyst.rules.Rule import org.apache.spark.sql.execution.SparkPlan @@ -37,7 +37,22 @@ case class CudfNodeValidationRule(glutenConf: GlutenConfig) extends Rule[SparkPl case _: LeafTransformSupport => true case _ => false }.isDefined - transformer.setTagValue(CudfTag.CudfTag, !hasLeaf) + if (!hasLeaf && VeloxConfig.get.cudfEnableValidation) { + if ( + VeloxCudfPlanValidatorJniWrapper.validate( + transformer.substraitPlan.toProtobuf.toByteArray) + ) { + transformer.foreach { + case _: LeafTransformSupport => + case t: TransformSupport => + t.setTagValue(CudfTag.CudfTag, true) + case _ => + } + transformer.setTagValue(CudfTag.CudfTag, true) + } + } else { + transformer.setTagValue(CudfTag.CudfTag, !hasLeaf) + } } else { transformer.setTagValue(CudfTag.CudfTag, true) } From 39730d85dc6c433f15016dc1770eee34d9987493 Mon Sep 17 00:00:00 2001 From: Chengcheng Jin Date: Thu, 16 Oct 2025 10:40:14 +0100 Subject: [PATCH 2/7] fix code style --- .../main/scala/org/apache/gluten/config/VeloxConfig.scala | 5 +++-- .../org/apache/gluten/extension/CudfNodeValidationRule.scala | 1 + 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/backends-velox/src/main/scala/org/apache/gluten/config/VeloxConfig.scala b/backends-velox/src/main/scala/org/apache/gluten/config/VeloxConfig.scala index 73059f841981..f68a23c35072 100644 --- a/backends-velox/src/main/scala/org/apache/gluten/config/VeloxConfig.scala +++ b/backends-velox/src/main/scala/org/apache/gluten/config/VeloxConfig.scala @@ -628,8 +628,9 @@ object VeloxConfig extends ConfigRegistry { val CUDF_ENABLE_VALIDATION = buildStaticConf("spark.gluten.sql.columnar.backend.velox.cudf.enableValidation") - .doc("Heuristics you can apply to validate a cuDF/GPU plan and only offload when " + - "the entire stage can be fully and profitably executed on GPU") + .doc( + "Heuristics you can apply to validate a cuDF/GPU plan and only offload when " + + "the entire stage can be fully and profitably executed on GPU") .booleanConf .createWithDefault(true) diff --git a/backends-velox/src/main/scala/org/apache/gluten/extension/CudfNodeValidationRule.scala b/backends-velox/src/main/scala/org/apache/gluten/extension/CudfNodeValidationRule.scala index b8bda269d6ab..a092b984c82d 100644 --- a/backends-velox/src/main/scala/org/apache/gluten/extension/CudfNodeValidationRule.scala +++ b/backends-velox/src/main/scala/org/apache/gluten/extension/CudfNodeValidationRule.scala @@ -19,6 +19,7 @@ package org.apache.gluten.extension import org.apache.gluten.config.{GlutenConfig, VeloxConfig} import org.apache.gluten.cudf.VeloxCudfPlanValidatorJniWrapper import org.apache.gluten.execution.{CudfTag, LeafTransformSupport, TransformSupport, WholeStageTransformer} + import org.apache.spark.sql.catalyst.rules.Rule import org.apache.spark.sql.execution.SparkPlan From 33eb45ac06a930d2b15b1e09ff8546b012574eb1 Mon Sep 17 00:00:00 2001 From: Chengcheng Jin Date: Thu, 16 Oct 2025 10:46:46 +0100 Subject: [PATCH 3/7] fix --- cpp/velox/cudf/CudfPlanValidator.cc | 17 +++++++++++++---- 1 file changed, 13 insertions(+), 4 deletions(-) diff --git a/cpp/velox/cudf/CudfPlanValidator.cc b/cpp/velox/cudf/CudfPlanValidator.cc index cad8aa5afebd..f7ae397750bf 100644 --- a/cpp/velox/cudf/CudfPlanValidator.cc +++ b/cpp/velox/cudf/CudfPlanValidator.cc @@ -24,11 +24,21 @@ #include "velox/core/PlanNode.h" #include "velox/exec/Task.h" #include "velox/exec/TableScan.h" +#include "velox/experimental/cudf/exec/NvtxHelper.h" #include "velox/experimental/cudf/exec/ToCudf.h" + using namespace facebook; namespace gluten { + +namespace { + +bool isCudfOperator(const exec::Operator* op) { + return isAnyOf(op); +} + +} bool CudfPlanValidator::validate(const ::substrait::Plan& substraitPlan) { auto veloxMemoryPool = gluten::defaultLeafVeloxMemoryPool(); std::vector<::substrait::ReadRel_LocalFiles> localFiles; @@ -64,10 +74,9 @@ bool CudfPlanValidator::validate(const ::substrait::Plan& substraitPlan) { if (dynamic_cast(op) != nullptr) { continue; } - // TODO: wait for PR https://github.com/facebookincubator/velox/pull/13341 - // if (cudf_velox::isCudfOperator(op)) { - // continue; - // } + if (isCudfOperator(op)) { + continue; + } if (dynamic_cast(op) != nullptr) { continue; } From b3567eca0d8682bb971ec49a11fbdd7f065d9880 Mon Sep 17 00:00:00 2001 From: Chengcheng Jin Date: Thu, 16 Oct 2025 10:47:47 +0100 Subject: [PATCH 4/7] fix --- cpp/velox/cudf/CudfPlanValidator.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/velox/cudf/CudfPlanValidator.cc b/cpp/velox/cudf/CudfPlanValidator.cc index f7ae397750bf..f711b3365d8e 100644 --- a/cpp/velox/cudf/CudfPlanValidator.cc +++ b/cpp/velox/cudf/CudfPlanValidator.cc @@ -27,7 +27,6 @@ #include "velox/experimental/cudf/exec/NvtxHelper.h" #include "velox/experimental/cudf/exec/ToCudf.h" - using namespace facebook; namespace gluten { @@ -39,6 +38,7 @@ bool isCudfOperator(const exec::Operator* op) { } } + bool CudfPlanValidator::validate(const ::substrait::Plan& substraitPlan) { auto veloxMemoryPool = gluten::defaultLeafVeloxMemoryPool(); std::vector<::substrait::ReadRel_LocalFiles> localFiles; From 7089ad4c702162a793eb8b4e0cb8ebe4a8df48ac Mon Sep 17 00:00:00 2001 From: Chengcheng Jin Date: Thu, 16 Oct 2025 16:13:17 +0100 Subject: [PATCH 5/7] fix document --- docs/velox-configuration.md | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/docs/velox-configuration.md b/docs/velox-configuration.md index 44996a5cd49e..fd7cd5cdcafa 100644 --- a/docs/velox-configuration.md +++ b/docs/velox-configuration.md @@ -9,7 +9,7 @@ nav_order: 16 ## Gluten Velox backend configurations -| Key | Default | Description | +| Key | Default | Description | |----------------------------------------------------------------------------------|-------------------|-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| | spark.gluten.sql.columnar.backend.velox.IOThreads | <undefined> | The Size of the IO thread pool in the Connector. This thread pool is used for split preloading and DirectBufferedInput. By default, the value is the same as the maximum task slots per Spark executor. | | spark.gluten.sql.columnar.backend.velox.SplitPreloadPerDriver | 2 | The split preload per task | @@ -23,6 +23,7 @@ nav_order: 16 | spark.gluten.sql.columnar.backend.velox.cachePrefetchMinPct | 0 | Set prefetch cache min pct for velox file scan | | spark.gluten.sql.columnar.backend.velox.checkUsageLeak | true | Enable check memory usage leak. | | spark.gluten.sql.columnar.backend.velox.cudf.enableTableScan | false | Enable cudf table scan | +| spark.gluten.sql.columnar.backend.velox.cudf.enableValidation | true | Heuristics you can apply to validate a cuDF/GPU plan and only offload when the entire stage can be fully and profitably executed on GPU | | spark.gluten.sql.columnar.backend.velox.cudf.memoryPercent | 50 | The initial percent of GPU memory to allocate for memory resource for one thread. | | spark.gluten.sql.columnar.backend.velox.cudf.memoryResource | async | GPU RMM memory resource. | | spark.gluten.sql.columnar.backend.velox.directorySizeGuess | 32KB | Deprecated, rename to spark.gluten.sql.columnar.backend.velox.footerEstimatedSize | @@ -48,8 +49,8 @@ nav_order: 16 | spark.gluten.sql.columnar.backend.velox.memoryPoolCapacityTransferAcrossTasks | true | Whether to allow memory capacity transfer between memory pools from different tasks. | | spark.gluten.sql.columnar.backend.velox.memoryUseHugePages | false | Use explicit huge pages for Velox memory allocation. | | spark.gluten.sql.columnar.backend.velox.orc.scan.enabled | true | Enable velox orc scan. If disabled, vanilla spark orc scan will be used. | -| spark.gluten.sql.columnar.backend.velox.orcUseColumnNames | true | Maps table field names to file field names using names, not indices for ORC files. If this is set to false Gluten will fallback to vanilla Spark if it does not support all column types present in any of the schemas of the tables being read, at this time unsupported types include TimestampNTZ and Char. | -| spark.gluten.sql.columnar.backend.velox.parquetUseColumnNames | true | Maps table field names to file field names using names, not indices for Parquet files. If this is set to false Gluten will fallback to vanilla Spark if it does not support all column types present in any of the schemas of the tables being read, at this time unsupported types include TimestampNTZ and Char. | +| spark.gluten.sql.columnar.backend.velox.orcUseColumnNames | true | Maps table field names to file field names using names, not indices for ORC files. | +| spark.gluten.sql.columnar.backend.velox.parquetUseColumnNames | true | Maps table field names to file field names using names, not indices for Parquet files. | | spark.gluten.sql.columnar.backend.velox.prefetchRowGroups | 1 | Set the prefetch row groups for velox file scan | | spark.gluten.sql.columnar.backend.velox.queryTraceEnabled | false | Enable query tracing flag. | | spark.gluten.sql.columnar.backend.velox.reclaimMaxWaitMs | 3600000ms | The max time in ms to wait for memory reclaim. | From 9e7840cd1ef8c3457bf71a2b314397f84a980960 Mon Sep 17 00:00:00 2001 From: Chengcheng Jin Date: Fri, 17 Oct 2025 14:11:00 +0100 Subject: [PATCH 6/7] fix compile --- cpp/velox/cudf/CudfPlanValidator.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/velox/cudf/CudfPlanValidator.cc b/cpp/velox/cudf/CudfPlanValidator.cc index f711b3365d8e..2a8b41a37be7 100644 --- a/cpp/velox/cudf/CudfPlanValidator.cc +++ b/cpp/velox/cudf/CudfPlanValidator.cc @@ -34,7 +34,7 @@ namespace gluten { namespace { bool isCudfOperator(const exec::Operator* op) { - return isAnyOf(op); + return isAnyOf(op); } } From f5df61cea970f59599a9e0c8283be5f895e239c0 Mon Sep 17 00:00:00 2001 From: Chengcheng Jin Date: Fri, 17 Oct 2025 17:20:37 +0100 Subject: [PATCH 7/7] fix --- cpp/velox/cudf/CudfPlanValidator.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/velox/cudf/CudfPlanValidator.cc b/cpp/velox/cudf/CudfPlanValidator.cc index 2a8b41a37be7..49949ca10071 100644 --- a/cpp/velox/cudf/CudfPlanValidator.cc +++ b/cpp/velox/cudf/CudfPlanValidator.cc @@ -34,7 +34,7 @@ namespace gluten { namespace { bool isCudfOperator(const exec::Operator* op) { - return isAnyOf(op); + return dynamic_cast(op) != nullptr; } }