From e1688c8812cc692d61c1977175e0ca2fa3fe67eb Mon Sep 17 00:00:00 2001 From: Liang-Chi Hsieh Date: Tue, 19 Dec 2023 12:34:35 -0800 Subject: [PATCH 1/3] Consider nullability when casting between structs --- arrow-cast/src/cast.rs | 77 +++++++++++++++++++++++++++++++++++++++--- 1 file changed, 72 insertions(+), 5 deletions(-) diff --git a/arrow-cast/src/cast.rs b/arrow-cast/src/cast.rs index 0775392b7d6..2932bfe39a8 100644 --- a/arrow-cast/src/cast.rs +++ b/arrow-cast/src/cast.rs @@ -160,11 +160,18 @@ pub fn can_cast_types(from_type: &DataType, to_type: &DataType) -> bool { (Decimal128(_, _) | Decimal256(_, _), Utf8 | LargeUtf8) => true, // Utf8 to decimal (Utf8 | LargeUtf8, Decimal128(_, _) | Decimal256(_, _)) => true, - (Struct(from_fields), Struct(to_fields)) => { - from_fields.len() == to_fields.len() && - from_fields.iter().zip(to_fields.iter()).all(|(f1, f2)| { - can_cast_types(f1.data_type(), f2.data_type()) - }) + (Struct(from_fields), Struct(to_fields)) => { + from_fields.len() == to_fields.len() && + from_fields.iter().zip(to_fields.iter()).all(|(f1, f2)| { + // Only allows cast nullable or non-nullable fields to nullable fields. + // Although it is generally allowed to cast non-nullable fields to non-nullable fields, + // but if casting f1.data_type to f2.data_type may return null, e.g., overflow, + // it is not allowed when f2 is non-nullable. Because `can_cast_types` doesn't + // take `CastOptions` so we cannot check `safe` option here. We take safer + // approach to assume `safe` is true, so disallowing the case of casting non-nullable + // fields to non-nullable fields. + f2.is_nullable() && can_cast_types(f1.data_type(), f2.data_type()) + }) } (Struct(_), _) => false, (_, Struct(_)) => false, @@ -9525,4 +9532,64 @@ mod tests { result.unwrap_err().to_string() ); } + + #[test] + fn test_cast_struct_to_struct_disallow_nullability() { + let from_type = DataType::Struct( + vec![ + Field::new("a", DataType::Boolean, true), + Field::new("b", DataType::Int32, true), + ] + .into(), + ); + + // allow: nullable to nullable + let to_type = DataType::Struct( + vec![ + Field::new("a", DataType::Utf8, true), + Field::new("b", DataType::Utf8, true), + ] + .into(), + ); + assert!(can_cast_types(&from_type, &to_type)); + + // disallow: nullable to non-nullable + let to_type = DataType::Struct( + vec![ + Field::new("a", DataType::Utf8, true), + Field::new("b", DataType::Utf8, false), + ] + .into(), + ); + assert!(!can_cast_types(&from_type, &to_type)); + + let from_type = DataType::Struct( + vec![ + Field::new("a", DataType::Boolean, false), + Field::new("b", DataType::Int32, false), + ] + .into(), + ); + + // allow: non-nullable to nullable + let to_type = DataType::Struct( + vec![ + Field::new("a", DataType::Utf8, true), + Field::new("b", DataType::Utf8, true), + ] + .into(), + ); + assert!(can_cast_types(&from_type, &to_type)); + + // disallow: non-nullable to non-nullable + // because casting may return null when overflow + let to_type = DataType::Struct( + vec![ + Field::new("a", DataType::Utf8, true), + Field::new("b", DataType::Utf8, false), + ] + .into(), + ); + assert!(!can_cast_types(&from_type, &to_type)); + } } From a3bc7b08f7354f5e153b09814727415dfa577b83 Mon Sep 17 00:00:00 2001 From: Liang-Chi Hsieh Date: Tue, 19 Dec 2023 16:13:03 -0800 Subject: [PATCH 2/3] Fix --- arrow-buffer/src/buffer/null.rs | 2 +- arrow-cast/src/cast.rs | 86 ++++++++++++++++++--------------- 2 files changed, 48 insertions(+), 40 deletions(-) diff --git a/arrow-buffer/src/buffer/null.rs b/arrow-buffer/src/buffer/null.rs index c79aef39805..0792db052c4 100644 --- a/arrow-buffer/src/buffer/null.rs +++ b/arrow-buffer/src/buffer/null.rs @@ -86,7 +86,7 @@ impl NullBuffer { } let lhs = self.inner().bit_chunks().iter_padded(); let rhs = other.inner().bit_chunks().iter_padded(); - lhs.zip(rhs).all(|(l, r)| (l & !r) == 0) + lhs.zip(rhs).all(|(l, r)| (l & !r) == 0) // l:0 -> r:0 or 1, l:1 -> r:1 } /// Returns a new [`NullBuffer`] where each bit in the current null buffer diff --git a/arrow-cast/src/cast.rs b/arrow-cast/src/cast.rs index 2932bfe39a8..92b9071a675 100644 --- a/arrow-cast/src/cast.rs +++ b/arrow-cast/src/cast.rs @@ -163,14 +163,9 @@ pub fn can_cast_types(from_type: &DataType, to_type: &DataType) -> bool { (Struct(from_fields), Struct(to_fields)) => { from_fields.len() == to_fields.len() && from_fields.iter().zip(to_fields.iter()).all(|(f1, f2)| { - // Only allows cast nullable or non-nullable fields to nullable fields. - // Although it is generally allowed to cast non-nullable fields to non-nullable fields, - // but if casting f1.data_type to f2.data_type may return null, e.g., overflow, - // it is not allowed when f2 is non-nullable. Because `can_cast_types` doesn't - // take `CastOptions` so we cannot check `safe` option here. We take safer - // approach to assume `safe` is true, so disallowing the case of casting non-nullable - // fields to non-nullable fields. - f2.is_nullable() && can_cast_types(f1.data_type(), f2.data_type()) + // Assume that nullability between two structs are compatible, if not, + // cast kernel will return error. + can_cast_types(f1.data_type(), f2.data_type()) }) } (Struct(_), _) => false, @@ -1159,7 +1154,7 @@ pub fn cast_with_options( .zip(to_fields.iter()) .map(|(l, field)| cast_with_options(l, field.data_type(), cast_options)) .collect::, ArrowError>>()?; - let array = StructArray::new(to_fields.clone(), fields, array.nulls().cloned()); + let array = StructArray::try_new(to_fields.clone(), fields, array.nulls().cloned())?; Ok(Arc::new(array) as ArrayRef) } (Struct(_), _) => Err(ArrowError::CastError( @@ -9534,62 +9529,75 @@ mod tests { } #[test] - fn test_cast_struct_to_struct_disallow_nullability() { - let from_type = DataType::Struct( - vec![ - Field::new("a", DataType::Boolean, true), - Field::new("b", DataType::Int32, true), - ] - .into(), - ); + fn test_cast_struct_to_struct_nullability() { + let boolean = Arc::new(BooleanArray::from(vec![false, false, true, true])); + let int = Arc::new(Int32Array::from(vec![Some(42), None, Some(19), None])); + let struct_array = StructArray::from(vec![ + ( + Arc::new(Field::new("b", DataType::Boolean, false)), + boolean.clone() as ArrayRef, + ), + ( + Arc::new(Field::new("c", DataType::Int32, true)), + int.clone() as ArrayRef, + ), + ]); - // allow: nullable to nullable + // okay: nullable to nullable let to_type = DataType::Struct( vec![ - Field::new("a", DataType::Utf8, true), + Field::new("a", DataType::Utf8, false), Field::new("b", DataType::Utf8, true), ] .into(), ); - assert!(can_cast_types(&from_type, &to_type)); + cast(&struct_array, &to_type).expect("Cast nullable to nullable struct field should work"); - // disallow: nullable to non-nullable + // error: nullable to non-nullable let to_type = DataType::Struct( vec![ - Field::new("a", DataType::Utf8, true), + Field::new("a", DataType::Utf8, false), Field::new("b", DataType::Utf8, false), ] .into(), ); - assert!(!can_cast_types(&from_type, &to_type)); + cast(&struct_array, &to_type) + .expect_err("Cast nullable to non-nullable struct field should fail"); - let from_type = DataType::Struct( - vec![ - Field::new("a", DataType::Boolean, false), - Field::new("b", DataType::Int32, false), - ] - .into(), - ); + let boolean = Arc::new(BooleanArray::from(vec![false, false, true, true])); + let int = Arc::new(Int32Array::from(vec![i32::MAX, 25, 1, 100])); + let struct_array = StructArray::from(vec![ + ( + Arc::new(Field::new("b", DataType::Boolean, false)), + boolean.clone() as ArrayRef, + ), + ( + Arc::new(Field::new("c", DataType::Int32, false)), + int.clone() as ArrayRef, + ), + ]); - // allow: non-nullable to nullable + // okay: non-nullable to non-nullable let to_type = DataType::Struct( vec![ - Field::new("a", DataType::Utf8, true), - Field::new("b", DataType::Utf8, true), + Field::new("a", DataType::Utf8, false), + Field::new("b", DataType::Utf8, false), ] .into(), ); - assert!(can_cast_types(&from_type, &to_type)); + cast(&struct_array, &to_type) + .expect("Cast non-nullable to non-nullable struct field should work"); - // disallow: non-nullable to non-nullable - // because casting may return null when overflow + // err: non-nullable to non-nullable but overflowing return null during casting let to_type = DataType::Struct( vec![ - Field::new("a", DataType::Utf8, true), - Field::new("b", DataType::Utf8, false), + Field::new("a", DataType::Utf8, false), + Field::new("b", DataType::Int8, false), ] .into(), ); - assert!(!can_cast_types(&from_type, &to_type)); + cast(&struct_array, &to_type).expect_err( + "Cast non-nullable to non-nullable struct field returning null should fail", + ); } } From 7cbed19d23c2de91d5c29b99fa644b979cb8f5dd Mon Sep 17 00:00:00 2001 From: Liang-Chi Hsieh Date: Wed, 20 Dec 2023 00:00:28 -0800 Subject: [PATCH 3/3] Remove --- arrow-buffer/src/buffer/null.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arrow-buffer/src/buffer/null.rs b/arrow-buffer/src/buffer/null.rs index 0792db052c4..c79aef39805 100644 --- a/arrow-buffer/src/buffer/null.rs +++ b/arrow-buffer/src/buffer/null.rs @@ -86,7 +86,7 @@ impl NullBuffer { } let lhs = self.inner().bit_chunks().iter_padded(); let rhs = other.inner().bit_chunks().iter_padded(); - lhs.zip(rhs).all(|(l, r)| (l & !r) == 0) // l:0 -> r:0 or 1, l:1 -> r:1 + lhs.zip(rhs).all(|(l, r)| (l & !r) == 0) } /// Returns a new [`NullBuffer`] where each bit in the current null buffer