From 5fe5d6209502d81e4b2de4c106f487af51a05731 Mon Sep 17 00:00:00 2001 From: Denise Wiedl Date: Mon, 18 Aug 2025 16:38:29 -0700 Subject: [PATCH 1/4] chore: add docs explaining FieldMetadata::merge_options --- datafusion/expr/src/expr.rs | 45 ++++++++++++++++++++++++++++++++++++- 1 file changed, 44 insertions(+), 1 deletion(-) diff --git a/datafusion/expr/src/expr.rs b/datafusion/expr/src/expr.rs index 9e2ac794de49..2324ae79c0ce 100644 --- a/datafusion/expr/src/expr.rs +++ b/datafusion/expr/src/expr.rs @@ -469,7 +469,50 @@ impl FieldMetadata { } /// Merges two optional `FieldMetadata` instances, overwriting any existing - /// keys in `m` with keys from `n` if present + /// keys in `m` with keys from `n` if present. + /// + /// This function is commonly used in alias operations, particularly for literals + /// with metadata. When creating an alias expression, the metadata from the original + /// expression (such as a literal) is combined with any metadata specified on the alias. + /// + /// # Arguments + /// + /// * `m` - The first metadata (typically from the original expression like a literal) + /// * `n` - The second metadata (typically from the alias definition) + /// + /// # Merge Strategy + /// + /// - If both metadata instances exist, they are merged with `n` taking precedence + /// - Keys from `n` will overwrite keys from `m` if they have the same name + /// - If only one metadata instance exists, it is returned unchanged + /// - If neither exists, `None` is returned + /// + /// # Example usage + /// ```rust + /// use datafusion_expr::expr::FieldMetadata; + /// use std::collections::BTreeMap; + /// + /// // Create metadata for a literal expression + /// let literal_metadata = Some(FieldMetadata::from(BTreeMap::from([ + /// ("source".to_string(), "constant".to_string()), + /// ("type".to_string(), "int".to_string()), + /// ]))); + /// + /// // Create metadata for an alias + /// let alias_metadata = Some(FieldMetadata::from(BTreeMap::from([ + /// 
("description".to_string(), "answer".to_string()), + /// ("source".to_string(), "user".to_string()), // This will override literal's "source" + /// ]))); + /// + /// // Merge the metadata + /// let merged = FieldMetadata::merge_options( + /// literal_metadata.as_ref(), + /// alias_metadata.as_ref(), + /// ); + /// + /// // Result contains: {"source": "user", "type": "int", "description": "answer"} + /// assert!(merged.is_some()); + /// ``` pub fn merge_options( m: Option<&FieldMetadata>, n: Option<&FieldMetadata>, From 35f89bc9d0c5133dfb82e335a5c876b9c2c97376 Mon Sep 17 00:00:00 2001 From: Denise Wiedl Date: Tue, 19 Aug 2025 12:57:25 -0700 Subject: [PATCH 2/4] chore: document DFSchema::merge, which is used in logical plan construction & modification (e.g. LP optimizers) --- datafusion/common/src/dfschema.rs | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/datafusion/common/src/dfschema.rs b/datafusion/common/src/dfschema.rs index daf4e19ce0f6..ec95e05bf59f 100644 --- a/datafusion/common/src/dfschema.rs +++ b/datafusion/common/src/dfschema.rs @@ -297,6 +297,19 @@ impl DFSchema { /// Modify this schema by appending the fields from the supplied schema, ignoring any /// duplicate fields. + /// + /// ## Merge Precedence + /// + /// **Schema-level metadata**: Metadata from both schemas is merged. + /// If both schemas have the same metadata key, the value from the second `other_schema` parameter takes precedence. + /// + /// **Field-level merging**: Only non-duplicate fields are added. This means that the + /// first `self` fields will always take precedence over the second `other_schema` fields. + /// Duplicate field detection is based on: + /// - For qualified fields: both qualifier and field name must match + /// - For unqualified fields: only field name needs to match + /// + /// Note: the merging operation prefers the first `self` fields, and the second `other_schema` metadata. 
pub fn merge(&mut self, other_schema: &DFSchema) { if other_schema.inner.fields.is_empty() { return; From 4494bf17d0a0113eed158c0aa8f30b8708402533 Mon Sep 17 00:00:00 2001 From: Denise Wiedl Date: Tue, 19 Aug 2025 12:50:43 -0700 Subject: [PATCH 3/4] chore: merge_schema utils method --- datafusion/expr/src/utils.rs | 3 +++ 1 file changed, 3 insertions(+) diff --git a/datafusion/expr/src/utils.rs b/datafusion/expr/src/utils.rs index 7a612b6fe6eb..2e364d0d2b80 100644 --- a/datafusion/expr/src/utils.rs +++ b/datafusion/expr/src/utils.rs @@ -1225,6 +1225,9 @@ pub fn only_or_err<T>(slice: &[T]) -> Result<&T> { } /// merge inputs schema into a single schema. +/// +/// This function merges schemas from multiple logical plan inputs using [`DFSchema::merge`]. +/// Refer to that documentation for details on precedence and metadata handling. pub fn merge_schema(inputs: &[&LogicalPlan]) -> DFSchema { if inputs.len() == 1 { inputs[0].schema().as_ref().clone() From 25c798abdac5c08186d895d0517e4865688d5401 Mon Sep 17 00:00:00 2001 From: Denise Wiedl Date: Thu, 21 Aug 2025 11:50:48 -0700 Subject: [PATCH 4/4] chore: clarify wording --- datafusion/common/src/dfschema.rs | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/datafusion/common/src/dfschema.rs b/datafusion/common/src/dfschema.rs index ec95e05bf59f..d3dda2888214 100644 --- a/datafusion/common/src/dfschema.rs +++ b/datafusion/common/src/dfschema.rs @@ -301,15 +301,16 @@ impl DFSchema { /// ## Merge Precedence /// /// **Schema-level metadata**: Metadata from both schemas is merged. - /// If both schemas have the same metadata key, the value from the second `other_schema` parameter takes precedence. + /// If both schemas have the same metadata key, the value from the `other_schema` parameter takes precedence. /// /// **Field-level merging**: Only non-duplicate fields are added. This means that the - /// first `self` fields will always take precedence over the second `other_schema` fields. 
+ /// `self` fields will always take precedence over the `other_schema` fields. /// Duplicate field detection is based on: /// - For qualified fields: both qualifier and field name must match /// - For unqualified fields: only field name needs to match /// - /// Note: the merging operation prefers the first `self` fields, and the second `other_schema` metadata. + /// Note that precedence differs between field merging and metadata merging: + /// merging prefers fields from `self` but prefers metadata from `other_schema`. pub fn merge(&mut self, other_schema: &DFSchema) { if other_schema.inner.fields.is_empty() { return;