Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
34 commits
Select commit Hold shift + click to select a range
9d40c16
fix: replace direct cast with cast_column utility for better type han…
kosiew Sep 16, 2025
4377845
fix: enhance type casting for Utf8 data in statistics to ensure safe …
kosiew Sep 16, 2025
5a73f7d
test: add unit test for building statistics with struct columns
kosiew Sep 16, 2025
0fbaeae
test: add unit tests for handling struct statistics errors in build_s…
kosiew Sep 16, 2025
e6ea9e1
fix: update casting options to use safe semantics for binary statistics
kosiew Sep 16, 2025
fe25700
fix: use safe casting for binary statistics to prevent invalid UTF-8 …
kosiew Sep 16, 2025
7cac892
feat: add support for nested struct column statistics in pruning logic
kosiew Sep 16, 2025
af03ada
fix: use safe casting options for binary statistics to handle invalid…
kosiew Sep 16, 2025
835e362
fix: adjust indentation for clarity in statistics record batch castin…
kosiew Sep 16, 2025
06cbcc8
fix: enhance UTF-8 validation for binary statistics in pruning logic
kosiew Sep 16, 2025
7102a14
refactor: simplify struct column statistics handling and improve requ…
kosiew Sep 16, 2025
4c3e992
fix: implement safe casting for invalid UTF-8 in statistics handling
kosiew Sep 16, 2025
ae87b25
fix: ensure invalid UTF-8 is converted to null in statistics record b…
kosiew Sep 16, 2025
904e448
fix: enhance casting logic for statistics handling of binary and larg…
kosiew Sep 16, 2025
edb83b3
fix: implement safe casting for invalid UTF-8 bytes in statistics rec…
kosiew Sep 16, 2025
d1d1c0b
fix: replace assertions with assert_contains for better error message…
kosiew Sep 16, 2025
d9c0bef
fix: enhance statistics handling for binary and UTF-8 types, adding s…
kosiew Sep 16, 2025
0d365d5
fix: replace assert with assert_contains for improved error message c…
kosiew Sep 16, 2025
d9def1f
fix: remove unused variable in build_statistics_record_batch function
kosiew Sep 16, 2025
f74b7b2
fix: remove unused imports in pruning_predicate.rs for cleaner code
kosiew Sep 16, 2025
f087733
fix: add TransformedResult to datafusion_common tree_node imports for…
kosiew Sep 16, 2025
8a36e51
fix: remove TransformedResult import for cleaner code in pruning_pred…
kosiew Sep 18, 2025
f465e87
fix: add TransformedResult import in pruning_predicate.rs for improve…
kosiew Sep 18, 2025
d85c342
feat: add sanitize_binary_array_for_utf8 function to preprocess Binar…
kosiew Sep 18, 2025
27f8004
fix: update condition to sanitize binary arrays for UTF-8 compatibility
kosiew Sep 18, 2025
5be80d5
test: add unit test for building statistics from BinaryViewArray to U…
kosiew Sep 18, 2025
f3afa05
fix: enhance sanitize_binary_array_for_utf8 to check for invalid UTF-…
kosiew Sep 18, 2025
d0c6227
feat: enhance struct casting to support safe UTF-8 conversion from Bi…
kosiew Sep 18, 2025
8e27fde
Merge branch 'main' into prune-16579
kosiew Sep 18, 2025
7c81a51
refactor: reorganize imports in pruning_predicate.rs for clarity
kosiew Sep 18, 2025
bf499f5
refactor: simplify array initialization in tests for clarity
kosiew Sep 18, 2025
1751bcf
refactor: improve documentation for sanitize_binary_array_for_utf8 to…
kosiew Sep 18, 2025
5cb940b
refactor: enhance test documentation for statistics casting and handl…
kosiew Sep 18, 2025
e1b53d2
refactor: add comments to clarify manual UTF-8 validation in sanitize…
kosiew Sep 22, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
75 changes: 69 additions & 6 deletions datafusion/common/src/nested_struct.rs
Original file line number Diff line number Diff line change
Expand Up @@ -17,11 +17,13 @@

use crate::error::{Result, _plan_err};
use arrow::{
array::{new_null_array, Array, ArrayRef, StructArray},
array::{
new_null_array, Array, ArrayRef, AsArray as _, BinaryViewBuilder, StructArray,
},
compute::{cast_with_options, CastOptions},
datatypes::{DataType::Struct, Field, FieldRef},
datatypes::{DataType, Field, FieldRef},
};
use std::sync::Arc;
use std::{str, sync::Arc};

/// Cast a struct column to match target struct fields, handling nested structs recursively.
///
Expand Down Expand Up @@ -151,9 +153,30 @@ pub fn cast_column(
cast_options: &CastOptions,
) -> Result<ArrayRef> {
match target_field.data_type() {
Struct(target_fields) => {
DataType::Struct(target_fields) => {
cast_struct_column(source_col, target_fields, cast_options)
}
DataType::Utf8 | DataType::LargeUtf8 | DataType::Utf8View => {
let mut options = cast_options.clone();
let mut source: ArrayRef = Arc::clone(source_col);

if matches!(
source_col.data_type(),
DataType::Binary | DataType::LargeBinary | DataType::BinaryView
) {
options.safe = true;

if matches!(source_col.data_type(), DataType::BinaryView) {
source = sanitize_binary_array_for_utf8(source);
}
}

Ok(cast_with_options(
&source,
target_field.data_type(),
&options,
)?)
}
_ => Ok(cast_with_options(
source_col,
target_field.data_type(),
Expand All @@ -162,6 +185,46 @@ pub fn cast_column(
}
}

/// Sanitizes a `BinaryView` array so that any element containing invalid UTF-8
/// is converted to null before casting to a UTF-8 string array.
///
/// This only transforms the array's values (not any external statistics). Other
/// binary array representations are returned unchanged because Arrow's safe
/// casts already convert invalid UTF-8 sequences to null for those types.
pub fn sanitize_binary_array_for_utf8(array: ArrayRef) -> ArrayRef {
match array.data_type() {
DataType::BinaryView => {
let binary_view = array.as_binary_view();

// Check if all bytes are already valid UTF-8
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This seems redundant with calling https://docs.rs/arrow/latest/arrow/compute/fn.cast_with_options.html with CastOptions::safe = true 🤔

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

the existing cast kernel handles invalid UTF-8 already

It does not handle BinaryView

#[test]
    fn test_arrow_cast_binaryview_to_utf8view_fails_with_invalid_utf8() {
        // Pins down the Arrow behavior that motivates manual sanitizing:
        // casting BinaryView -> Utf8View is expected to return an error on
        // invalid UTF-8 both with safe=false and with safe=true, whereas the
        // other binary array types turn such values into nulls under a safe cast.
        use arrow::compute::kernels::cast::{cast_with_options, CastOptions};

        // Three non-null entries; only the middle one is not valid UTF-8.
        let invalid_utf8: &[u8] = &[0xf0, 0x28, 0x8c, 0x28];
        let values = vec![
            Some("valid".as_bytes()),
            Some(invalid_utf8),
            Some("also_valid".as_bytes()),
        ];
        let input: ArrayRef = Arc::new(BinaryViewArray::from(values));

        // Strict cast: the default options have safe=false, so this must fail.
        let strict_options = CastOptions::default();
        let strict_outcome =
            cast_with_options(&input, &DataType::Utf8View, &strict_options);
        assert!(
            strict_outcome.is_err(),
            "Expected BinaryView->Utf8View cast to fail with safe=false"
        );
        let strict_message = strict_outcome.unwrap_err().to_string();
        assert!(
            strict_message.contains("Encountered non-UTF-8 data"),
            "Error should mention non-UTF-8 data"
        );

        // Safe cast: same input, safe=true. For BinaryView this is still
        // expected to fail rather than produce a null.
        let lenient_options = CastOptions {
            safe: true,
            ..CastOptions::default()
        };
        let lenient_outcome =
            cast_with_options(&input, &DataType::Utf8View, &lenient_options);
        assert!(
            lenient_outcome.is_err(),
            "BinaryView->Utf8View cast fails even with safe=true (unlike other binary types)"
        );
        let lenient_message = lenient_outcome.unwrap_err().to_string();
        assert!(
            lenient_message.contains("Encountered non-UTF-8 data"),
            "Safe cast error should also mention non-UTF-8 data"
        );
    }

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Ah, that is a great find and I am sorry I did not understand that context.

This sounds like a bug in upstream arrow -- I think we should file a ticket in arrow-rs and then leave a reference to that ticket in this implementation (so we can remove the datafusion code when the corresponding code in arrow-rs is released).

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I will create the issue.

// Manual validation is required for BinaryView because Arrow's safe cast
// doesn't handle invalid UTF-8 sequences properly for this array type
// See: https://github.com/apache/arrow-rs/issues/8403
let has_invalid_bytes = binary_view.iter().any(
|value| matches!(value, Some(bytes) if str::from_utf8(bytes).is_err()),
);

if !has_invalid_bytes {
return array;
}

let mut builder = BinaryViewBuilder::with_capacity(binary_view.len());

for value in binary_view.iter() {
match value {
Some(bytes) if str::from_utf8(bytes).is_ok() => {
builder.append_value(bytes)
}
_ => builder.append_null(),
}
}

Arc::new(builder.finish()) as ArrayRef
}
_ => array,
}
}

/// Validates compatibility between source and target struct fields for casting operations.
///
/// This function implements comprehensive struct compatibility checking by examining:
Expand Down Expand Up @@ -220,7 +283,7 @@ pub fn validate_struct_compatibility(
// Check if the matching field types are compatible
match (source_field.data_type(), target_field.data_type()) {
// Recursively validate nested structs
(Struct(source_nested), Struct(target_nested)) => {
(DataType::Struct(source_nested), DataType::Struct(target_nested)) => {
validate_struct_compatibility(source_nested, target_nested)?;
}
// For non-struct types, use the existing castability check
Expand Down Expand Up @@ -284,7 +347,7 @@ mod tests {
}

fn struct_type(fields: Vec<Field>) -> DataType {
Struct(fields.into())
DataType::Struct(fields.into())
}

fn struct_field(name: &str, fields: Vec<Field>) -> Field {
Expand Down
Loading