Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
126 changes: 125 additions & 1 deletion parquet-variant-compute/src/shred_variant.rs
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,8 @@

use crate::variant_array::{ShreddedVariantFieldArray, StructArrayBuilder};
use crate::variant_to_arrow::{
PrimitiveVariantToArrowRowBuilder, make_primitive_variant_to_arrow_row_builder,
PrimitiveVariantToArrowRowBuilder, VariantToBooleanArrowRowBuilder,
make_primitive_variant_to_arrow_row_builder,
};
use crate::{VariantArray, VariantValueArrayBuilder};
use arrow::array::{ArrayRef, BinaryViewArray, NullBufferBuilder};
Expand Down Expand Up @@ -123,6 +124,12 @@ pub(crate) fn make_variant_to_shredded_variant_arrow_row_builder<'a>(
"Shredding variant array values as arrow lists".to_string(),
));
}
DataType::Boolean => {
let builder = VariantToBooleanArrowRowBuilder::new(cast_options, capacity);
let typed_value_builder =
VariantToShreddedBooleanVariantRowBuilder::new(builder, capacity, top_level);
VariantToShreddedVariantRowBuilder::Boolean(typed_value_builder)
}
_ => {
let builder =
make_primitive_variant_to_arrow_row_builder(data_type, cast_options, capacity)?;
Expand All @@ -135,13 +142,15 @@ pub(crate) fn make_variant_to_shredded_variant_arrow_row_builder<'a>(
}

/// Row builder that shreds variant values, dispatching every call to the concrete builder
/// selected for the requested shredding type. `finish` yields the `value` column, the
/// `typed_value` column and an optional null buffer.
pub(crate) enum VariantToShreddedVariantRowBuilder<'a> {
/// Shreds into a boolean `typed_value` column (chosen for `DataType::Boolean`).
Boolean(VariantToShreddedBooleanVariantRowBuilder<'a>),
/// Shreds into a primitive `typed_value` column.
Primitive(VariantToShreddedPrimitiveVariantRowBuilder<'a>),
/// Shreds variant objects. NOTE(review): the object builder is not visible in this view.
Object(VariantToShreddedObjectVariantRowBuilder<'a>),
}
impl<'a> VariantToShreddedVariantRowBuilder<'a> {
pub fn append_null(&mut self) -> Result<()> {
use VariantToShreddedVariantRowBuilder::*;
match self {
Boolean(b) => b.append_null(),
Primitive(b) => b.append_null(),
Object(b) => b.append_null(),
}
Expand All @@ -150,6 +159,7 @@ impl<'a> VariantToShreddedVariantRowBuilder<'a> {
pub fn append_value(&mut self, value: Variant<'_, '_>) -> Result<bool> {
use VariantToShreddedVariantRowBuilder::*;
match self {
Boolean(b) => b.append_value(value),
Primitive(b) => b.append_value(value),
Object(b) => b.append_value(value),
}
Expand All @@ -158,12 +168,59 @@ impl<'a> VariantToShreddedVariantRowBuilder<'a> {
pub fn finish(self) -> Result<(BinaryViewArray, ArrayRef, Option<NullBuffer>)> {
use VariantToShreddedVariantRowBuilder::*;
match self {
Boolean(b) => b.finish(),
Primitive(b) => b.finish(),
Object(b) => b.finish(),
}
}
}

/// Shreds variant values into a boolean `typed_value` column, keeping any value that could
/// not be shredded in the fallback `value` column.
pub(crate) struct VariantToShreddedBooleanVariantRowBuilder<'a> {
/// Fallback column: receives the original variant whenever the typed builder reports that
/// the value was not shredded, and a null otherwise.
value_builder: VariantValueArrayBuilder,
/// Builder for the boolean `typed_value` column.
typed_value_builder: VariantToBooleanArrowRowBuilder<'a>,
/// Validity of each row of the shredded output.
nulls: NullBufferBuilder,
/// When `true`, `append_null` marks the row as null; when `false` the row stays valid.
top_level: bool,
}

impl<'a> VariantToShreddedBooleanVariantRowBuilder<'a> {
    /// Creates a shredding builder around `typed_value_builder`, pre-sizing the fallback
    /// `value` column and the validity buffer for `capacity` rows.
    ///
    /// `top_level` controls whether [`Self::append_null`] records the row itself as null.
    pub(crate) fn new(
        typed_value_builder: VariantToBooleanArrowRowBuilder<'a>,
        capacity: usize,
        top_level: bool,
    ) -> Self {
        let value_builder = VariantValueArrayBuilder::new(capacity);
        let nulls = NullBufferBuilder::new(capacity);
        Self {
            value_builder,
            typed_value_builder,
            nulls,
            top_level,
        }
    }

    /// Appends a null row to both columns.
    ///
    /// The row is only flagged as null in the validity buffer for a top-level builder;
    /// otherwise it is recorded as valid.
    fn append_null(&mut self) -> Result<()> {
        let row_is_valid = !self.top_level;
        self.nulls.append(row_is_valid);
        self.value_builder.append_null();
        self.typed_value_builder.append_null()
    }

    /// Appends one variant value, always as a valid row.
    ///
    /// If the typed builder accepts the value, the fallback `value` slot is null; otherwise
    /// the original variant is stored in the fallback column. Always returns `Ok(true)`
    /// unless the typed builder fails.
    fn append_value(&mut self, value: Variant<'_, '_>) -> Result<bool> {
        self.nulls.append_non_null();
        let shredded = self.typed_value_builder.append_value(&value)?;
        if shredded {
            self.value_builder.append_null();
        } else {
            self.value_builder.append_value(value);
        }
        Ok(true)
    }

    /// Consumes the builder and returns `(value column, typed_value column, null buffer)`.
    fn finish(mut self) -> Result<(BinaryViewArray, ArrayRef, Option<NullBuffer>)> {
        let value = self.value_builder.build()?;
        let typed_value = self.typed_value_builder.finish()?;
        let nulls = self.nulls.finish();
        Ok((value, typed_value, nulls))
    }
}

/// A top-level variant shredder -- appending NULL produces typed_value=NULL and value=Variant::Null
pub(crate) struct VariantToShreddedPrimitiveVariantRowBuilder<'a> {
value_builder: VariantValueArrayBuilder,
Expand Down Expand Up @@ -380,6 +437,73 @@ mod tests {
shred_variant(&input, &list_schema).expect_err("unsupported");
}

#[test]
fn test_boolean_shredding() {
    // Input mixes shreddable booleans, both kinds of null, and non-boolean values.
    let rows = vec![
        Some(Variant::from(true)),   // row 0: shreds
        Some(Variant::from(false)),  // row 1: shreds
        None,                        // row 2: array-level null
        Some(Variant::Null),         // row 3: variant null
        Some(Variant::from(1i64)),   // row 4: not a bool, stays in `value`
        Some(Variant::from("true")), // row 5: not a bool, stays in `value`
    ];
    let input = create_test_variant_array(rows);

    let shredded = shred_variant(&input, &DataType::Boolean).unwrap();

    let values = shredded.value_field().unwrap();
    let typed_values = shredded
        .typed_value_field()
        .unwrap()
        .as_any()
        .downcast_ref::<arrow::array::BooleanArray>()
        .unwrap();

    assert_eq!(shredded.len(), 6);

    // Rows 0 and 1: booleans land in `typed_value`, leaving `value` null.
    for (row, expected) in [(0, true), (1, false)] {
        assert!(!shredded.is_null(row));
        assert!(values.is_null(row));
        assert!(!typed_values.is_null(row));
        assert_eq!(typed_values.value(row), expected);
    }

    // Row 2: array-level null
    assert!(shredded.is_null(2));

    // Rows 3-5: anything that is not a boolean is kept verbatim in `value`,
    // with a null `typed_value`.
    let unshredded = [
        (3, Variant::Null),
        (4, Variant::from(1i64)),
        (5, Variant::from("true")),
    ];
    for (row, expected) in unshredded {
        assert!(!shredded.is_null(row));
        assert!(!values.is_null(row));
        assert!(typed_values.is_null(row));
        assert_eq!(
            Variant::new(shredded.metadata_field().value(row), values.value(row)),
            expected
        );
    }
}

#[test]
fn test_primitive_shredding_comprehensive() {
// Test mixed scenarios in a single array
Expand Down
49 changes: 49 additions & 0 deletions parquet-variant-compute/src/variant_to_arrow.rs
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,7 @@ pub(crate) enum PrimitiveVariantToArrowRowBuilder<'a> {
/// Useful for variant_get kernels that need to extract specific paths from variant values, possibly
/// with casting of leaf values to specific types.
pub(crate) enum VariantToArrowRowBuilder<'a> {
Boolean(VariantToBooleanArrowRowBuilder<'a>),
Primitive(PrimitiveVariantToArrowRowBuilder<'a>),
BinaryVariant(VariantToBinaryVariantArrowRowBuilder),

Expand Down Expand Up @@ -112,6 +113,7 @@ impl<'a> VariantToArrowRowBuilder<'a> {
pub fn append_null(&mut self) -> Result<()> {
use VariantToArrowRowBuilder::*;
match self {
Boolean(b) => b.append_null(),
Primitive(b) => b.append_null(),
BinaryVariant(b) => b.append_null(),
WithPath(path_builder) => path_builder.append_null(),
Expand All @@ -121,6 +123,7 @@ impl<'a> VariantToArrowRowBuilder<'a> {
pub fn append_value(&mut self, value: Variant<'_, '_>) -> Result<bool> {
use VariantToArrowRowBuilder::*;
match self {
Boolean(b) => b.append_value(&value),
Primitive(b) => b.append_value(&value),
BinaryVariant(b) => b.append_value(value),
WithPath(path_builder) => path_builder.append_value(value),
Expand All @@ -130,6 +133,7 @@ impl<'a> VariantToArrowRowBuilder<'a> {
pub fn finish(self) -> Result<ArrayRef> {
use VariantToArrowRowBuilder::*;
match self {
Boolean(b) => b.finish(),
Primitive(b) => b.finish(),
BinaryVariant(b) => b.finish(),
WithPath(path_builder) => path_builder.finish(),
Expand Down Expand Up @@ -235,6 +239,9 @@ pub(crate) fn make_variant_to_arrow_row_builder<'a>(
"Converting unshredded variant arrays to arrow lists".to_string(),
));
}
Some(DataType::Boolean) => {
Boolean(VariantToBooleanArrowRowBuilder::new(cast_options, capacity))
}
Some(data_type) => {
let builder =
make_primitive_variant_to_arrow_row_builder(data_type, cast_options, capacity)?;
Expand Down Expand Up @@ -297,6 +304,48 @@ fn get_type_name<T: ArrowPrimitiveType>() -> &'static str {
}
}

/// Builder for converting variant values to boolean values
///
/// Values that are not booleans become nulls when `cast_options.safe` is set, and produce a
/// cast error otherwise.
pub(crate) struct VariantToBooleanArrowRowBuilder<'a> {
/// Accumulates the output boolean column.
builder: arrow::array::BooleanBuilder,
/// Cast behaviour; only the `safe` flag is consulted by this builder.
cast_options: &'a CastOptions<'a>,
}

impl<'a> VariantToBooleanArrowRowBuilder<'a> {
    /// Creates a builder whose boolean column is pre-sized for `capacity` rows.
    pub fn new(cast_options: &'a CastOptions<'a>, capacity: usize) -> Self {
        let builder = arrow::array::BooleanBuilder::with_capacity(capacity);
        Self {
            builder,
            cast_options,
        }
    }

    /// Appends a null entry to the boolean column.
    pub fn append_null(&mut self) -> Result<()> {
        self.builder.append_null();
        Ok(())
    }

    /// Appends `value` if it is a boolean.
    ///
    /// Returns `Ok(true)` when the boolean was stored. For any other variant the outcome
    /// depends on `cast_options.safe`: a null is appended and `Ok(false)` is returned when
    /// it is set, otherwise nothing is appended.
    ///
    /// # Errors
    ///
    /// Returns `ArrowError::CastError` for a non-boolean value when `cast_options.safe` is
    /// `false`.
    pub fn append_value(&mut self, value: &Variant<'_, '_>) -> Result<bool> {
        match value.as_boolean() {
            Some(flag) => {
                self.builder.append_value(flag);
                Ok(true)
            }
            // Safe casting: a failed conversion turns into a null entry.
            None if self.cast_options.safe => {
                self.builder.append_null();
                Ok(false)
            }
            // Unsafe casting: a failed conversion is reported to the caller.
            None => Err(ArrowError::CastError(format!(
                "Failed to extract boolean from variant {:?} at path VariantPath([])",
                value
            ))),
        }
    }

    /// Consumes the builder and returns the finished boolean array.
    pub fn finish(mut self) -> Result<ArrayRef> {
        let array = self.builder.finish();
        Ok(Arc::new(array))
    }
}

/// Builder for converting variant values to primitive values
pub(crate) struct VariantToPrimitiveArrowRowBuilder<'a, T: PrimitiveFromVariant> {
builder: arrow::array::PrimitiveBuilder<T>,
Expand Down
Loading