From 6d98418d7f996f2da13d314c575c25bbeeaa37b3 Mon Sep 17 00:00:00 2001 From: Brijesh-Thakkar Date: Sat, 30 May 2026 12:46:54 +0530 Subject: [PATCH 1/3] fix: recompute_schema() now checks types and names for LogicalPlan::Union Previously the Union arm only validated field count (width), causing stale cached schemas when inputs were rewritten with different types or column names (e.g. after type-coercion). Fix uses qualified_field() to deep-compare qualifier + data_type + name. Fixes #22447 --- datafusion/expr/src/logical_plan/plan.rs | 97 +++++++++++++++++++++++- 1 file changed, 95 insertions(+), 2 deletions(-) diff --git a/datafusion/expr/src/logical_plan/plan.rs b/datafusion/expr/src/logical_plan/plan.rs index e7e03bcac5150..1c801951c32d8 100644 --- a/datafusion/expr/src/logical_plan/plan.rs +++ b/datafusion/expr/src/logical_plan/plan.rs @@ -709,8 +709,20 @@ impl LogicalPlan { } LogicalPlan::Union(Union { inputs, schema }) => { let first_input_schema = inputs[0].schema(); - if schema.fields().len() == first_input_schema.fields().len() { - // If inputs are not pruned do not change schema + // Check field count AND field types AND field names/qualifiers. + // A width-only check misses cases where inputs were rewritten with + // different types or aliases (e.g. after type-coercion rewrites). + let schemas_match = schema.fields().len() == first_input_schema.fields().len() + && (0..schema.fields().len()).all(|i| { + let (q1, f1) = schema.qualified_field(i); + let (q2, f2) = first_input_schema.qualified_field(i); + q1 == q2 + && f1.data_type() == f2.data_type() + && f1.name() == f2.name() + }); + if schemas_match { + // Inputs are structurally identical to the cached schema; + // no recomputation needed. 
Ok(LogicalPlan::Union(Union { inputs, schema })) } else { // A note on `Union`s constructed via `try_new_by_name`: @@ -6070,4 +6082,85 @@ mod tests { Ok(()) } + + #[test] + fn test_recompute_schema_union_type_mismatch() -> Result<()> { + use arrow::datatypes::{DataType, Field, Schema}; + + let schema_i32 = Schema::new(vec![Field::new("a", DataType::Int32, false)]); + let schema_i64 = Schema::new(vec![Field::new("a", DataType::Int64, false)]); + + // Build a Union whose schema starts out as Int32 (matching its inputs). + let original = Union::try_new(vec![ + Arc::new(table_scan(Some("t1"), &schema_i32, None)?.build()?), + Arc::new(table_scan(Some("t2"), &schema_i32, None)?.build()?), + ])?; + assert_eq!( + original.schema.field(0).data_type(), + &DataType::Int32, + "sanity: starting schema is Int32" + ); + + // Simulate a rewrite pass (e.g. type-coercion) that replaced the inputs + // with Int64-typed versions while leaving the Union's cached schema stale. + // Same width, different types — this is exactly the bug scenario. + let stale = LogicalPlan::Union(Union { + inputs: vec![ + Arc::new(table_scan(Some("t1"), &schema_i64, None)?.build()?), + Arc::new(table_scan(Some("t2"), &schema_i64, None)?.build()?), + ], + schema: Arc::clone(&original.schema), + }); + + let recomputed = stale.recompute_schema()?; + + assert_eq!( + recomputed.schema().field(0).data_type(), + &DataType::Int64, + "Union schema should track the new Int64 input types after \ + recompute_schema(), but the width-only check left it stale" + ); + + Ok(()) + } + + #[test] + fn test_recompute_schema_union_name_mismatch() -> Result<()> { + use arrow::datatypes::{DataType, Field, Schema}; + + let schema_a = Schema::new(vec![Field::new("a", DataType::Int32, false)]); + let schema_b = Schema::new(vec![Field::new("b", DataType::Int32, false)]); + + // Build a Union whose schema starts out with column "a". 
+ let original = Union::try_new(vec![ + Arc::new(table_scan(Some("t1"), &schema_a, None)?.build()?), + Arc::new(table_scan(Some("t2"), &schema_a, None)?.build()?), + ])?; + assert_eq!( + original.schema.field(0).name(), + "a", + "sanity: starting schema has column name 'a'" + ); + + // Simulate a rewrite pass that renamed the columns but left + // the cached schema stale. Same width and type, different name. + let stale = LogicalPlan::Union(Union { + inputs: vec![ + Arc::new(table_scan(Some("t1"), &schema_b, None)?.build()?), + Arc::new(table_scan(Some("t2"), &schema_b, None)?.build()?), + ], + schema: Arc::clone(&original.schema), + }); + + let recomputed = stale.recompute_schema()?; + + assert_eq!( + recomputed.schema().field(0).name(), + "b", + "Union schema should reflect the renamed column after \ + recompute_schema(), but the width-only check left it stale" + ); + + Ok(()) + } } From 21d3eaad4607e52544c8975746eaf40f8a081a5b Mon Sep 17 00:00:00 2001 From: Brijesh-Thakkar Date: Sat, 30 May 2026 12:57:00 +0530 Subject: [PATCH 2/3] fix(expr): recompute_schema() correctly detects stale Union schema Previously the Union arm only checked field count (width), causing stale cached schemas when inputs were rewritten with different types, names, qualifiers, or nullability but the same column count. Fix: call Union::try_new(inputs) to get the authoritative recomputed schema, then compare against the cached schema via DFSchema PartialEq. This handles all field properties in one place and is future-proof. Added three unit tests covering type, name, and nullability mismatches. 
Fixes #22447 --- datafusion/expr/src/logical_plan/plan.rs | 85 +++++++++++++++++------- 1 file changed, 61 insertions(+), 24 deletions(-) diff --git a/datafusion/expr/src/logical_plan/plan.rs b/datafusion/expr/src/logical_plan/plan.rs index 991d950f12213..45a808d18d362 100644 --- a/datafusion/expr/src/logical_plan/plan.rs +++ b/datafusion/expr/src/logical_plan/plan.rs @@ -708,31 +708,28 @@ impl LogicalPlan { })) } LogicalPlan::Union(Union { inputs, schema }) => { - let first_input_schema = inputs[0].schema(); - // Check field count AND field types AND field names/qualifiers. - // A width-only check misses cases where inputs were rewritten with - // different types or aliases (e.g. after type-coercion rewrites). - let schemas_match = schema.fields().len() == first_input_schema.fields().len() - && (0..schema.fields().len()).all(|i| { - let (q1, f1) = schema.qualified_field(i); - let (q2, f2) = first_input_schema.qualified_field(i); - q1 == q2 - && f1.data_type() == f2.data_type() - && f1.name() == f2.name() - }); - if schemas_match { - // Inputs are structurally identical to the cached schema; - // no recomputation needed. - Ok(LogicalPlan::Union(Union { inputs, schema })) + // Recompute what the schema should be from the current inputs. + // Comparing the full recomputed schema (not just inputs[0]) correctly + // handles: field-count changes, type changes, name/alias changes, + // nullability changes, and metadata changes — all in one place. + // + // A note on `Union`s constructed via `try_new_by_name`: + // + // At this point, the schema for each input should have + // the same width. Thus, we do not need to save whether a + // `Union` was created `BY NAME`, and can safely rely on the + // `try_new` initializer to derive the new schema based on + // column positions. + let recomputed = Union::try_new(inputs)?; + if recomputed.schema == schema { + // Schema is still valid; preserve the cached schema + // (which may carry metadata the recomputed one lacks). 
+ Ok(LogicalPlan::Union(Union { + inputs: recomputed.inputs, + schema, + })) } else { - // A note on `Union`s constructed via `try_new_by_name`: - // - // At this point, the schema for each input should have - // the same width. Thus, we do not need to save whether a - // `Union` was created `BY NAME`, and can safely rely on the - // `try_new` initializer to derive the new schema based on - // column positions. - Ok(LogicalPlan::Union(Union::try_new(inputs)?)) + Ok(LogicalPlan::Union(recomputed)) } } LogicalPlan::Distinct(distinct) => { @@ -6218,4 +6215,44 @@ mod tests { Ok(()) } + + #[test] + fn test_recompute_schema_union_nullability_mismatch() -> Result<()> { + use arrow::datatypes::{DataType, Field, Schema}; + + // nullable: false + let schema_not_null = Schema::new(vec![Field::new("a", DataType::Int32, false)]); + // nullable: true + let schema_nullable = Schema::new(vec![Field::new("a", DataType::Int32, true)]); + + // Build Union starting with NOT NULL inputs. + let original = Union::try_new(vec![ + Arc::new(table_scan(Some("t1"), &schema_not_null, None)?.build()?), + Arc::new(table_scan(Some("t2"), &schema_not_null, None)?.build()?), + ])?; + assert!( + !original.schema.field(0).is_nullable(), + "sanity: starting schema field is NOT NULL" + ); + + // Simulate a rewrite that made the inputs nullable while leaving + // the Union's cached schema stale. 
+ let stale = LogicalPlan::Union(Union { + inputs: vec![ + Arc::new(table_scan(Some("t1"), &schema_nullable, None)?.build()?), + Arc::new(table_scan(Some("t2"), &schema_nullable, None)?.build()?), + ], + schema: Arc::clone(&original.schema), + }); + + let recomputed = stale.recompute_schema()?; + + assert!( + recomputed.schema().field(0).is_nullable(), + "Union schema should reflect the new nullable inputs after \ + recompute_schema(), but the stale NOT NULL schema was kept" + ); + + Ok(()) + } } From 38a313d71eb6028236d1fe588c3d1dab455b268d Mon Sep 17 00:00:00 2001 From: Brijesh-Thakkar Date: Sun, 31 May 2026 11:44:55 +0530 Subject: [PATCH 3/3] fix(expr): recompute_schema() correctly detects stale Union schema Previously the Union arm only checked field count (width), causing stale cached schemas when inputs were rewritten with different types, names, qualifiers, or nullability but the same column count. Fix: - Fast path: structural comparison across ALL inputs (field count, type, name, qualifier, nullability) with zero allocation on the common no-change case, as suggested by the reviewer. - Slow path: Union::try_new() for structure, then HashMap::extend() semantics for schema-level and field-level metadata preservation, matching the behavior of coerce_union_schema_with_schema in type_coercion.rs. Added four unit tests covering type mismatch, name mismatch, nullability mismatch, and metadata preservation. Fixes #22447 --- datafusion/expr/src/logical_plan/plan.rs | 133 ++++++++++++++++++++--- 1 file changed, 117 insertions(+), 16 deletions(-) diff --git a/datafusion/expr/src/logical_plan/plan.rs b/datafusion/expr/src/logical_plan/plan.rs index 9865a284c0901..b4636e158571e 100644 --- a/datafusion/expr/src/logical_plan/plan.rs +++ b/datafusion/expr/src/logical_plan/plan.rs @@ -708,29 +708,76 @@ impl LogicalPlan { })) } LogicalPlan::Union(Union { inputs, schema }) => { - // Recompute what the schema should be from the current inputs. 
- // Comparing the full recomputed schema (not just inputs[0]) correctly - // handles: field-count changes, type changes, name/alias changes, - // nullability changes, and metadata changes — all in one place. - // - // A note on `Union`s constructed via `try_new_by_name`: + // Fast path: if all inputs structurally match the cached schema + // (field count, types, names, qualifiers, nullability) then no + // recomputation is needed and we avoid any allocation. + let schemas_match = inputs.iter().all(|input| { + let input_schema = input.schema(); + schema.fields().len() == input_schema.fields().len() + && schema.iter().zip(input_schema.iter()).all( + |((q1, f1), (q2, f2))| { + q1 == q2 + && f1.name() == f2.name() + && f1.data_type() == f2.data_type() + && f1.is_nullable() == f2.is_nullable() + }, + ) + }); + if schemas_match { + // Inputs are structurally identical to the cached schema. + return Ok(LogicalPlan::Union(Union { inputs, schema })); + } + + // Slow path: inputs changed — recompute the schema. // + // NOTE: A note on `Union`s constructed via `try_new_by_name`: // At this point, the schema for each input should have // the same width. Thus, we do not need to save whether a // `Union` was created `BY NAME`, and can safely rely on the // `try_new` initializer to derive the new schema based on // column positions. - let recomputed = Union::try_new(inputs)?; - if recomputed.schema == schema { - // Schema is still valid; preserve the cached schema - // (which may carry metadata the recomputed one lacks). - Ok(LogicalPlan::Union(Union { - inputs: recomputed.inputs, - schema, - })) - } else { - Ok(LogicalPlan::Union(recomputed)) + let mut recomputed = Union::try_new(inputs)?; + + // Metadata preservation: Union::try_new uses intersection logic + // for metadata, but we want "later takes precedence" (extend semantics) + // to match coerce_union_schema_with_schema in type_coercion.rs. 
+ let mut merged_metadata = + recomputed.inputs[0].schema().metadata().clone(); + for input in recomputed.inputs.iter().skip(1) { + merged_metadata.extend(input.schema().metadata().clone()); } + + let mut merged_field_metadata = recomputed.inputs[0] + .schema() + .fields() + .iter() + .map(|f| f.metadata().clone()) + .collect::<Vec<_>>(); + + for input in recomputed.inputs.iter().skip(1) { + for (field_meta, input_field) in merged_field_metadata + .iter_mut() + .zip(input.schema().fields()) + { + field_meta.extend(input_field.metadata().clone()); + } + } + + let new_fields = recomputed + .schema + .iter() + .zip(merged_field_metadata) + .map(|((qualifier, field), meta)| { + let mut field = field.as_ref().clone(); + field.set_metadata(meta); + (qualifier.cloned(), Arc::new(field)) + }) + .collect::<Vec<_>>(); + + recomputed.schema = + Arc::new(DFSchema::new_with_metadata(new_fields, merged_metadata)?); + + Ok(LogicalPlan::Union(recomputed)) } LogicalPlan::Distinct(distinct) => { let distinct = match distinct { @@ -6258,4 +6305,58 @@ mod tests { Ok(()) } + + #[test] + fn test_recompute_schema_union_metadata_preservation() -> Result<()> { + use arrow::datatypes::{DataType, Field, Schema}; + use std::collections::HashMap; + + let mut meta1 = HashMap::new(); + meta1.insert("k1".to_string(), "v1".to_string()); + let mut meta2 = HashMap::new(); + meta2.insert("k1".to_string(), "v2".to_string()); // duplicate key, different value + meta2.insert("k2".to_string(), "v2".to_string()); + + let schema1 = Schema::new_with_metadata( + vec![Field::new("a", DataType::Int32, false)], + meta1.clone(), + ); + let schema2 = Schema::new_with_metadata( + vec![Field::new("a", DataType::Int32, false)], + meta2.clone(), + ); + + // Build a Union. Its initial schema will have intersected metadata. 
+ let original = Union::try_new(vec![ + Arc::new(table_scan(Some("t1"), &schema1, None)?.build()?), + Arc::new(table_scan(Some("t2"), &schema2, None)?.build()?), + ])?; + + // Union::try_new uses intersection, so k1 should be missing (v1 != v2) + // and k2 should be missing (not in meta1). + assert!(original.schema.metadata().is_empty()); + + // Now simulate recompute_schema() where we want EXTEND semantics (later takes precedence). + // Our implementation of recompute_schema for Union now does this. + let stale = LogicalPlan::Union(Union { + inputs: vec![ + Arc::new(table_scan(Some("t1"), &schema1, None)?.build()?), + Arc::new(table_scan(Some("t2"), &schema2, None)?.build()?), + ], + // Use a dummy schema that forces recomputation (e.g. different name) + schema: Arc::new(DFSchema::try_from(Schema::new(vec![Field::new( + "wrong_name", + DataType::Int32, + false, + )]))?), + }); + + let recomputed = stale.recompute_schema()?; + + // Metadata should now be {k1: v2, k2: v2} because meta2 was the last input. + assert_eq!(recomputed.schema().metadata().get("k1").unwrap(), "v2"); + assert_eq!(recomputed.schema().metadata().get("k2").unwrap(), "v2"); + + Ok(()) + } }