From bf9e4114f62c1b8c3c7a407df5f9a23c68f6e8d4 Mon Sep 17 00:00:00 2001 From: Siew Kam Onn Date: Thu, 20 Nov 2025 16:43:03 +0800 Subject: [PATCH 1/7] Encode EmptyRelation plans as Substrait virtual tables Transform EmptyRelation plans to produce a null-typed row as Substrait virtual tables. This change allows Substrait consumption to recognize zero-column virtual tables with one row as placeholder EmptyRelation plans, restoring produce_one_row semantics on roundtrip. Added a test for literal queries without a FROM clause to ensure correct behavior. --- .../src/logical_plan/consumer/rel/read_rel.rs | 11 ++++++ .../src/logical_plan/producer/rel/read_rel.rs | 36 ++++++++++++++----- .../tests/cases/roundtrip_logical_plan.rs | 5 +++ 3 files changed, 43 insertions(+), 9 deletions(-) diff --git a/datafusion/substrait/src/logical_plan/consumer/rel/read_rel.rs b/datafusion/substrait/src/logical_plan/consumer/rel/read_rel.rs index 48e93c04bb03..23e6a721cc66 100644 --- a/datafusion/substrait/src/logical_plan/consumer/rel/read_rel.rs +++ b/datafusion/substrait/src/logical_plan/consumer/rel/read_rel.rs @@ -121,6 +121,17 @@ pub async fn from_read_rel( })); } + if vt.values.len() == 1 + && vt.expressions.is_empty() + && substrait_schema.fields().is_empty() + && vt.values[0].fields.is_empty() + { + return Ok(LogicalPlan::EmptyRelation(EmptyRelation { + produce_one_row: true, + schema: DFSchemaRef::new(substrait_schema), + })); + } + let values = if !vt.expressions.is_empty() { let mut exprs = vec![]; for row in &vt.expressions { diff --git a/datafusion/substrait/src/logical_plan/producer/rel/read_rel.rs b/datafusion/substrait/src/logical_plan/producer/rel/read_rel.rs index 4b2e3782108b..56b8516e1f73 100644 --- a/datafusion/substrait/src/logical_plan/producer/rel/read_rel.rs +++ b/datafusion/substrait/src/logical_plan/producer/rel/read_rel.rs @@ -18,9 +18,10 @@ use crate::logical_plan::producer::{ to_substrait_literal, to_substrait_named_struct, SubstraitProducer, }; -use datafusion::common::{not_impl_err, substrait_datafusion_err, DFSchema, ToDFSchema}; +use datafusion::common::{substrait_datafusion_err, DFSchema, ToDFSchema}; use datafusion::logical_expr::utils::conjunction; use datafusion::logical_expr::{EmptyRelation, Expr, TableScan, Values}; +use datafusion::scalar::ScalarValue; use std::sync::Arc; use substrait::proto::expression::literal::Struct; use substrait::proto::expression::mask_expression::{StructItem, StructSelect}; @@ -87,22 +88,39 @@ pub fn from_empty_relation( producer: &mut impl SubstraitProducer, e: &EmptyRelation, ) -> datafusion::common::Result> { - if e.produce_one_row { - return not_impl_err!("Producing a row from empty relation is unsupported"); - } + let base_schema = to_substrait_named_struct(producer, &e.schema)?; + + let read_type = if e.produce_one_row { + let fields = e + .schema + .fields() + .iter() + .map(|f| { + let scalar = ScalarValue::try_from(f.data_type())?; + to_substrait_literal(producer, &scalar) + }) + .collect::>()?; + + ReadType::VirtualTable(VirtualTable { + values: vec![Struct { fields }], + expressions: vec![], + }) + } else { + ReadType::VirtualTable(VirtualTable { + values: vec![], + expressions: vec![], + }) + }; #[allow(deprecated)] Ok(Box::new(Rel { rel_type: Some(RelType::Read(Box::new(ReadRel { common: None, - base_schema: Some(to_substrait_named_struct(producer, &e.schema)?), + base_schema: Some(base_schema), filter: None, best_effort_filter: None, projection: None, advanced_extension: None, - read_type: Some(ReadType::VirtualTable(VirtualTable { - values: vec![], - expressions: vec![], - })), + read_type: Some(read_type), }))), })) } diff --git a/datafusion/substrait/tests/cases/roundtrip_logical_plan.rs b/datafusion/substrait/tests/cases/roundtrip_logical_plan.rs index f14d4cbf1fcc..92bb5a5cfeb1 100644 --- a/datafusion/substrait/tests/cases/roundtrip_logical_plan.rs +++ b/datafusion/substrait/tests/cases/roundtrip_logical_plan.rs @@ -185,6 +185,11 @@ async fn simple_select() -> Result<()> { roundtrip("SELECT a, b FROM data").await } +#[tokio::test] +async fn roundtrip_literal_without_from() -> Result<()> { + roundtrip("SELECT 1 AS one").await +} + #[tokio::test] async fn wildcard_select() -> Result<()> { let plan = generate_plan_from_sql("SELECT * FROM data", true, false).await?; From f5e21748828c08189c7af408eccdea69bbf3a943 Mon Sep 17 00:00:00 2001 From: Siew Kam Onn Date: Thu, 20 Nov 2025 16:59:05 +0800 Subject: [PATCH 2/7] Update deprecated values handling in read_rel.rs Add #[allow(deprecated)] for values field in Producer initializations. Maintain usage of deprecated values field due to incomplete consumer support for new expressions field. Enhance Consumer compatibility by updating produce_one_row logic to accommodate both field patterns and handling for future-proof expressions field usage. --- .../src/logical_plan/consumer/rel/read_rel.rs | 86 +++++++++++-------- .../src/logical_plan/producer/rel/read_rel.rs | 5 +- 2 files changed, 51 insertions(+), 40 deletions(-) diff --git a/datafusion/substrait/src/logical_plan/consumer/rel/read_rel.rs b/datafusion/substrait/src/logical_plan/consumer/rel/read_rel.rs index 23e6a721cc66..0fbf7a4bc77e 100644 --- a/datafusion/substrait/src/logical_plan/consumer/rel/read_rel.rs +++ b/datafusion/substrait/src/logical_plan/consumer/rel/read_rel.rs @@ -114,6 +114,7 @@ pub async fn from_read_rel( .await } Some(ReadType::VirtualTable(vt)) => { + #[allow(deprecated)] if vt.values.is_empty() && vt.expressions.is_empty() { return Ok(LogicalPlan::EmptyRelation(EmptyRelation { produce_one_row: false, @@ -121,11 +122,18 @@ pub async fn from_read_rel( })); } - if vt.values.len() == 1 + // Check for produce_one_row pattern in both old (values) and new (expressions) formats + #[allow(deprecated)] + let is_produce_one_row = (vt.values.len() == 1 && vt.expressions.is_empty() && substrait_schema.fields().is_empty() - && vt.values[0].fields.is_empty() - { + && vt.values[0].fields.is_empty()) + || (vt.expressions.len() == 1 + && vt.values.is_empty() + && substrait_schema.fields().is_empty() + && vt.expressions[0].fields.is_empty()); + + if is_produce_one_row { return Ok(LogicalPlan::EmptyRelation(EmptyRelation { produce_one_row: true, schema: DFSchemaRef::new(substrait_schema), @@ -135,54 +143,56 @@ pub async fn from_read_rel( let values = if !vt.expressions.is_empty() { let mut exprs = vec![]; for row in &vt.expressions { - let mut name_idx = 0; let mut row_exprs = vec![]; for expression in &row.fields { - name_idx += 1; let expr = consumer - .consume_expression(expression, &DFSchema::empty()) + .consume_expression(expression, &substrait_schema) .await?; row_exprs.push(expr); } - if name_idx != named_struct.names.len() { + // For expressions, validate against top-level schema fields, not nested names + if row_exprs.len() != substrait_schema.fields().len() { return substrait_err!( - "Names list must match exactly to nested schema, but found {} uses for {} names", - name_idx, - named_struct.names.len() + "Field count mismatch: expected {} fields but found {} in virtual table row", + substrait_schema.fields().len(), + row_exprs.len() ); } exprs.push(row_exprs); } exprs } else { - vt - .values - .iter() - .map(|row| { - let mut name_idx = 0; - let lits = row - .fields - .iter() - .map(|lit| { - name_idx += 1; // top-level names are provided through schema - Ok(Expr::Literal(from_substrait_literal( - consumer, - lit, - &named_struct.names, - &mut name_idx, - )?, None)) - }) - .collect::>()?; - if name_idx != named_struct.names.len() { - return substrait_err!( - "Names list must match exactly to nested schema, but found {} uses for {} names", - name_idx, - named_struct.names.len() - ); - } - Ok(lits) - }) - .collect::>()? + #[allow(deprecated)] + { + vt + .values + .iter() + .map(|row| { + let mut name_idx = 0; + let lits = row + .fields + .iter() + .map(|lit| { + name_idx += 1; // top-level names are provided through schema + Ok(Expr::Literal(from_substrait_literal( + consumer, + lit, + &named_struct.names, + &mut name_idx, + )?, None)) + }) + .collect::>()?; + if name_idx != named_struct.names.len() { + return substrait_err!( + "Names list must match exactly to nested schema, but found {} uses for {} names", + name_idx, + named_struct.names.len() + ); + } + Ok(lits) + }) + .collect::>()? + } }; Ok(LogicalPlan::Values(Values { diff --git a/datafusion/substrait/src/logical_plan/producer/rel/read_rel.rs b/datafusion/substrait/src/logical_plan/producer/rel/read_rel.rs index 56b8516e1f73..3a7c0100dcad 100644 --- a/datafusion/substrait/src/logical_plan/producer/rel/read_rel.rs +++ b/datafusion/substrait/src/logical_plan/producer/rel/read_rel.rs @@ -102,16 +102,17 @@ pub fn from_empty_relation( .collect::>()?; ReadType::VirtualTable(VirtualTable { + #[allow(deprecated)] values: vec![Struct { fields }], expressions: vec![], }) } else { ReadType::VirtualTable(VirtualTable { + #[allow(deprecated)] values: vec![], expressions: vec![], }) }; - #[allow(deprecated)] Ok(Box::new(Rel { rel_type: Some(RelType::Read(Box::new(ReadRel { common: None, @@ -152,7 +153,6 @@ pub fn from_values( Ok(Struct { fields }) }) .collect::>()?; - #[allow(deprecated)] Ok(Box::new(Rel { rel_type: Some(RelType::Read(Box::new(ReadRel { common: None, @@ -162,6 +162,7 @@ pub fn from_values( projection: None, advanced_extension: None, read_type: Some(ReadType::VirtualTable(VirtualTable { + #[allow(deprecated)] values, expressions: vec![], })), From 6bfb73bf19f65ae6057710004eaaf2a9ad924d6e Mon Sep 17 00:00:00 2001 From: Siew Kam Onn Date: Thu, 20 Nov 2025 17:44:43 +0800 Subject: [PATCH 3/7] Document produce_one_row usage and enhance tests Add detailed function-level documentation for produce_one_row, covering its purpose, usage scenarios, default values context, and rationale for deprecated fields. Improve pattern detection logic in read_rel.rs with clear explanations. Expand test coverage in roundtrip_logical_plan.rs with four new tests ensuring correct serialization/deserialization for EmptyRelation and related scenarios. Verify improvements to sqllogictest functionality. --- .../src/logical_plan/consumer/rel/read_rel.rs | 6 ++- .../src/logical_plan/producer/rel/read_rel.rs | 17 ++++++++ .../tests/cases/roundtrip_logical_plan.rs | 40 ++++++++++++++++++- 3 files changed, 60 insertions(+), 3 deletions(-) diff --git a/datafusion/substrait/src/logical_plan/consumer/rel/read_rel.rs b/datafusion/substrait/src/logical_plan/consumer/rel/read_rel.rs index 0fbf7a4bc77e..f6e87fd2485d 100644 --- a/datafusion/substrait/src/logical_plan/consumer/rel/read_rel.rs +++ b/datafusion/substrait/src/logical_plan/consumer/rel/read_rel.rs @@ -122,7 +122,11 @@ pub async fn from_read_rel( })); } - // Check for produce_one_row pattern in both old (values) and new (expressions) formats + // Check for produce_one_row pattern in both old (values) and new (expressions) formats. + // A VirtualTable with exactly one row containing only empty/default fields represents + // an EmptyRelation with produce_one_row=true. This pattern is used for queries without + // a FROM clause (e.g., "SELECT 1 AS one") where a single phantom row is needed to + // provide a context for evaluating scalar expressions. #[allow(deprecated)] let is_produce_one_row = (vt.values.len() == 1 && vt.expressions.is_empty() diff --git a/datafusion/substrait/src/logical_plan/producer/rel/read_rel.rs b/datafusion/substrait/src/logical_plan/producer/rel/read_rel.rs index 3a7c0100dcad..1a5a5bb3e185 100644 --- a/datafusion/substrait/src/logical_plan/producer/rel/read_rel.rs +++ b/datafusion/substrait/src/logical_plan/producer/rel/read_rel.rs @@ -84,6 +84,15 @@ pub fn from_table_scan( })) } +/// Encodes an EmptyRelation as a Substrait VirtualTable. +/// +/// EmptyRelation represents a relation with no input data. When `produce_one_row` is true, +/// it generates a single row with all fields set to their default values (typically NULL). +/// This is used for queries without a FROM clause, such as "SELECT 1 AS one" or +/// "SELECT current_timestamp()". +/// +/// When `produce_one_row` is false, it represents a truly empty relation with no rows, +/// used in optimizations or as a placeholder. pub fn from_empty_relation( producer: &mut impl SubstraitProducer, e: &EmptyRelation, @@ -91,6 +100,10 @@ pub fn from_empty_relation( let base_schema = to_substrait_named_struct(producer, &e.schema)?; let read_type = if e.produce_one_row { + // Create one row with default scalar values for each field in the schema. + // For example, an Int32 field gets Int32(NULL), a Utf8 field gets Utf8(NULL), etc. + // This represents the "phantom row" that provides a context for evaluating + // scalar expressions in queries without a FROM clause. let fields = e .schema .fields() @@ -102,6 +115,10 @@ pub fn from_empty_relation( .collect::>()?; ReadType::VirtualTable(VirtualTable { + // Use deprecated 'values' field instead of 'expressions' because the consumer's + // nested expression support (RexType::Nested) is not yet implemented. + // The 'values' field uses literal::Struct which the consumer can properly + // deserialize with field name preservation. #[allow(deprecated)] values: vec![Struct { fields }], expressions: vec![], diff --git a/datafusion/substrait/tests/cases/roundtrip_logical_plan.rs b/datafusion/substrait/tests/cases/roundtrip_logical_plan.rs index 92bb5a5cfeb1..cf64435a453d 100644 --- a/datafusion/substrait/tests/cases/roundtrip_logical_plan.rs +++ b/datafusion/substrait/tests/cases/roundtrip_logical_plan.rs @@ -32,8 +32,8 @@ use datafusion::execution::registry::SerializerRegistry; use datafusion::execution::runtime_env::RuntimeEnv; use datafusion::execution::session_state::SessionStateBuilder; use datafusion::logical_expr::{ - Extension, InvariantLevel, LogicalPlan, PartitionEvaluator, Repartition, - UserDefinedLogicalNode, Values, Volatility, + EmptyRelation, Extension, InvariantLevel, LogicalPlan, PartitionEvaluator, + Repartition, UserDefinedLogicalNode, Values, Volatility, }; use datafusion::optimizer::simplify_expressions::expr_simplifier::THRESHOLD_INLINE_INLIST; use datafusion::prelude::*; @@ -190,6 +190,42 @@ async fn roundtrip_literal_without_from() -> Result<()> { roundtrip("SELECT 1 AS one").await } +#[tokio::test] +async fn roundtrip_empty_relation_with_schema() -> Result<()> { + // Test produce_one_row=true with multiple typed columns + roundtrip("SELECT 1::int as a, 'hello'::text as b, 3.14::double as c").await +} + +#[tokio::test] +async fn roundtrip_empty_relation_no_rows() -> Result<()> { + // Test produce_one_row=false + let ctx = create_context().await?; + let plan = LogicalPlan::EmptyRelation(EmptyRelation { + produce_one_row: false, + schema: DFSchemaRef::new(DFSchema::empty()), + }); + roundtrip_logical_plan_with_ctx(plan, ctx).await?; + Ok(()) +} + +#[tokio::test] +async fn roundtrip_subquery_with_empty_relation() -> Result<()> { + // Test EmptyRelation in the context of scalar subqueries. + // This pattern appears in aggregate queries (related to original bug report). + // The optimizer may simplify the subquery away, but we're testing that + // the EmptyRelation round-trips correctly when it appears in the plan. + let ctx = create_context().await?; + let df = ctx.sql("SELECT (SELECT 1) as nested").await?; + let plan = df.into_optimized_plan()?; + + // Just verify the round-trip succeeds and produces valid results + let proto = to_substrait_plan(&plan, &ctx.state())?; + let plan2 = from_substrait_plan(&ctx.state(), &proto).await?; + let df2 = DataFrame::new(ctx.state(), plan2); + df2.show().await?; + Ok(()) +} + #[tokio::test] async fn wildcard_select() -> Result<()> { let plan = generate_plan_from_sql("SELECT * FROM data", true, false).await?; From c8465123a98f15a85241c8212f68d94b0408a762 Mon Sep 17 00:00:00 2001 From: Siew Kam Onn Date: Thu, 20 Nov 2025 18:07:06 +0800 Subject: [PATCH 4/7] remove allow(deprecated) --- datafusion/substrait/src/logical_plan/consumer/rel/read_rel.rs | 3 --- 1 file changed, 3 deletions(-) diff --git a/datafusion/substrait/src/logical_plan/consumer/rel/read_rel.rs b/datafusion/substrait/src/logical_plan/consumer/rel/read_rel.rs index f6e87fd2485d..e0c99eb3f9ac 100644 --- a/datafusion/substrait/src/logical_plan/consumer/rel/read_rel.rs +++ b/datafusion/substrait/src/logical_plan/consumer/rel/read_rel.rs @@ -114,7 +114,6 @@ pub async fn from_read_rel( .await } Some(ReadType::VirtualTable(vt)) => { - #[allow(deprecated)] if vt.values.is_empty() && vt.expressions.is_empty() { return Ok(LogicalPlan::EmptyRelation(EmptyRelation { produce_one_row: false, @@ -127,7 +126,6 @@ pub async fn from_read_rel( // an EmptyRelation with produce_one_row=true. This pattern is used for queries without // a FROM clause (e.g., "SELECT 1 AS one") where a single phantom row is needed to // provide a context for evaluating scalar expressions. - #[allow(deprecated)] let is_produce_one_row = (vt.values.len() == 1 && vt.expressions.is_empty() && substrait_schema.fields().is_empty() @@ -166,7 +164,6 @@ pub async fn from_read_rel( } exprs } else { - #[allow(deprecated)] { vt .values From 12ee2a671eadbc3f2537d1457e9eb554afbb954a Mon Sep 17 00:00:00 2001 From: Siew Kam Onn Date: Thu, 20 Nov 2025 18:25:09 +0800 Subject: [PATCH 5/7] Refactor from_read_rel to use convert_literal_rows for processing VirtualTable literals --- .../src/logical_plan/consumer/rel/read_rel.rs | 71 +++++++++++-------- 1 file changed, 41 insertions(+), 30 deletions(-) diff --git a/datafusion/substrait/src/logical_plan/consumer/rel/read_rel.rs b/datafusion/substrait/src/logical_plan/consumer/rel/read_rel.rs index e0c99eb3f9ac..9d22b5e060d3 100644 --- a/datafusion/substrait/src/logical_plan/consumer/rel/read_rel.rs +++ b/datafusion/substrait/src/logical_plan/consumer/rel/read_rel.rs @@ -164,36 +164,7 @@ pub async fn from_read_rel( } exprs } else { - { - vt - .values - .iter() - .map(|row| { - let mut name_idx = 0; - let lits = row - .fields - .iter() - .map(|lit| { - name_idx += 1; // top-level names are provided through schema - Ok(Expr::Literal(from_substrait_literal( - consumer, - lit, - &named_struct.names, - &mut name_idx, - )?, None)) - }) - .collect::>()?; - if name_idx != named_struct.names.len() { - return substrait_err!( - "Names list must match exactly to nested schema, but found {} uses for {} names", - name_idx, - named_struct.names.len() - ); - } - Ok(lits) - }) - .collect::>()? - } + convert_literal_rows(consumer, vt, named_struct)? }; Ok(LogicalPlan::Values(Values { @@ -248,6 +219,46 @@ pub async fn from_read_rel( } } +/// Converts Substrait literal rows from a VirtualTable into DataFusion expressions. +/// +/// This function processes the deprecated `values` field of VirtualTable, converting +/// each literal value into a `Expr::Literal` while tracking and validating the name +/// indices against the provided named struct schema. +fn convert_literal_rows( + consumer: &impl SubstraitConsumer, + vt: &substrait::proto::read_rel::VirtualTable, + named_struct: &substrait::proto::NamedStruct, +) -> datafusion::common::Result>> { + #[allow(deprecated)] + vt.values + .iter() + .map(|row| { + let mut name_idx = 0; + let lits = row + .fields + .iter() + .map(|lit| { + name_idx += 1; // top-level names are provided through schema + Ok(Expr::Literal(from_substrait_literal( + consumer, + lit, + &named_struct.names, + &mut name_idx, + )?, None)) + }) + .collect::>()?; + if name_idx != named_struct.names.len() { + return substrait_err!( + "Names list must match exactly to nested schema, but found {} uses for {} names", + name_idx, + named_struct.names.len() + ); + } + Ok(lits) + }) + .collect::>() +} + pub fn apply_masking( schema: DFSchema, mask_expression: &::core::option::Option, From dacf601a4a30f432e1b1654221b68526078e2b21 Mon Sep 17 00:00:00 2001 From: Siew Kam Onn Date: Thu, 20 Nov 2025 18:45:14 +0800 Subject: [PATCH 6/7] Amend comment from roundtrip_subquery_with_empty_relation test --- datafusion/substrait/tests/cases/roundtrip_logical_plan.rs | 1 - 1 file changed, 1 deletion(-) diff --git a/datafusion/substrait/tests/cases/roundtrip_logical_plan.rs b/datafusion/substrait/tests/cases/roundtrip_logical_plan.rs index cf64435a453d..d931dd58d8ef 100644 --- a/datafusion/substrait/tests/cases/roundtrip_logical_plan.rs +++ b/datafusion/substrait/tests/cases/roundtrip_logical_plan.rs @@ -211,7 +211,6 @@ async fn roundtrip_empty_relation_no_rows() -> Result<()> { #[tokio::test] async fn roundtrip_subquery_with_empty_relation() -> Result<()> { // Test EmptyRelation in the context of scalar subqueries. - // This pattern appears in aggregate queries (related to original bug report). // The optimizer may simplify the subquery away, but we're testing that // the EmptyRelation round-trips correctly when it appears in the plan. let ctx = create_context().await?; From d0bf84e2ba79875ce581af2ef56787c550e575d0 Mon Sep 17 00:00:00 2001 From: Siew Kam Onn Date: Fri, 21 Nov 2025 14:29:21 +0800 Subject: [PATCH 7/7] Enhance comment in from_read_rel to clarify the concept of EmptyRelation and its similarity to SQL "DUAL" table --- .../substrait/src/logical_plan/consumer/rel/read_rel.rs | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/datafusion/substrait/src/logical_plan/consumer/rel/read_rel.rs b/datafusion/substrait/src/logical_plan/consumer/rel/read_rel.rs index 9d22b5e060d3..35062695e8ac 100644 --- a/datafusion/substrait/src/logical_plan/consumer/rel/read_rel.rs +++ b/datafusion/substrait/src/logical_plan/consumer/rel/read_rel.rs @@ -125,7 +125,10 @@ pub async fn from_read_rel( // A VirtualTable with exactly one row containing only empty/default fields represents // an EmptyRelation with produce_one_row=true. This pattern is used for queries without // a FROM clause (e.g., "SELECT 1 AS one") where a single phantom row is needed to - // provide a context for evaluating scalar expressions. + // provide a context for evaluating scalar expressions. This is conceptually similar to + // the SQL "DUAL" table (see: https://en.wikipedia.org/wiki/DUAL_table) which some + // databases provide as a single-row source for selecting constant expressions when no + // real table is present. let is_produce_one_row = (vt.values.len() == 1 && vt.expressions.is_empty() && substrait_schema.fields().is_empty()