Skip to content
64 changes: 61 additions & 3 deletions parquet-variant-compute/src/type_conversion.rs
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,8 @@

//! Module for transforming a typed arrow `Array` to `VariantArray`.

use arrow::datatypes::{self, ArrowPrimitiveType};
use arrow::datatypes::{self, ArrowPrimitiveType, ArrowTimestampType, Date32Type};
use chrono::{DateTime, Utc};
use parquet_variant::Variant;

/// Options for controlling the behavior of `cast_to_variant_with_options`.
Expand All @@ -38,12 +39,40 @@ pub(crate) trait PrimitiveFromVariant: ArrowPrimitiveType {
fn from_variant(variant: &Variant<'_, '_>) -> Option<Self::Native>;
}

/// Extension trait for Arrow timestamp types that can extract their native value from a Variant.
///
/// We can't use [`PrimitiveFromVariant`] directly because we need _two_ implementations for each
/// timestamp type, selected by the `NTZ` const parameter: `NTZ = true` is implemented on top of
/// the `as_timestamp_ntz_*` accessors and `NTZ = false` on top of the `as_timestamp_*` accessors.
pub(crate) trait TimestampFromVariant<const NTZ: bool>: ArrowTimestampType {
/// Returns the timestamp's native value, or `None` if it cannot be extracted from `variant`.
fn from_variant(variant: &Variant<'_, '_>) -> Option<Self::Native>;
}

/// Extension trait that lets an `ArrowTimestampType` handle a `DateTime<Utc>` the same way it
/// handles a `NaiveDateTime`.
trait MakeValueTz: ArrowTimestampType {
fn make_value_tz(timestamp: DateTime<Utc>) -> Option<i64> {
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

note: this name is fine, but because it's a different trait we should also be able to "overload" make_value if you want.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Sorry for not adding a comment about this. I used make_value_tz here because the code doesn't compile if I try to "overload" make_value: the compiler requires me to qualify the call with a specific type, both in MakeValueTz::make_value and in the impl_timestamp_from_variant macro (something like <Self as ArrowTimestampType>::make_value(...) and <Self as MakeValueTz>::make_value(...)). From what I found by searching, Rust doesn't seem to support "overloading" a function name with different parameters. I'm not sure if I missed anything here.

Self::make_value(timestamp.naive_utc())
}
}

// Blanket impl: every `ArrowTimestampType` picks up the default `make_value_tz` method.
impl<T: ArrowTimestampType> MakeValueTz for T {}

/// Macro to generate `PrimitiveFromVariant` implementations for Arrow primitive types.
///
/// `$variant_method` is the `Variant` accessor to call; the optional `$cast_fn` is mapped over
/// the accessor's result to produce the Arrow native value.
macro_rules! impl_primitive_from_variant {
($arrow_type:ty, $variant_method:ident) => {
($arrow_type:ty, $variant_method:ident $(, $cast_fn:expr)?) => {
impl PrimitiveFromVariant for $arrow_type {
fn from_variant(variant: &Variant<'_, '_>) -> Option<Self::Native> {
variant.$variant_method()
let value = variant.$variant_method();
$( let value = value.map($cast_fn); )?
value
}
}
};
}

macro_rules! impl_timestamp_from_variant {
($timestamp_type:ty, $variant_method:ident, ntz=$ntz:ident, $cast_fn:expr $(,)?) => {
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

After this change, I removed the micro -> nano conversion, because I'm not sure how to define the macro_rules for it.

The macro rule I have in mind right now is something like the one below:

 ($timestamp_type:ty, $variant_method_a:ident, $(opt=$variant_method_b:ident,)? ntz=$ntz:ident, $cast_fn:expr $(,)?) => {
        impl TimestampFromVariant<{ $ntz }> for $timestamp_type {
            fn from_variant(variant: &Variant<'_, '_>) -> Option<Self::Native> {
                let value = variant.$variant_method_a();
                if value.is_some() {
                    return value.and_then($cast_fn);
                } else {
                    #[allow(unused_mut)]
                    let mut value = None;
                    $(
                        value = variant.$variant_method_b();
                    )?
                    return value.and_then($cast_fn);
                }
            }
        }
    };

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The micro -> nano conversion actually happens inside as_timestamp_[ntz_]nanos, I left a comment there about it.

impl TimestampFromVariant<{ $ntz }> for $timestamp_type {
fn from_variant(variant: &Variant<'_, '_>) -> Option<Self::Native> {
variant.$variant_method().and_then($cast_fn)
}
}
};
Expand All @@ -60,6 +89,35 @@ impl_primitive_from_variant!(datatypes::UInt64Type, as_u64);
impl_primitive_from_variant!(datatypes::Float16Type, as_f16);
impl_primitive_from_variant!(datatypes::Float32Type, as_f32);
impl_primitive_from_variant!(datatypes::Float64Type, as_f64);
// Date32: read the variant via `as_naive_date`, then map the result through
// `Date32Type::from_naive_date`.
impl_primitive_from_variant!(
datatypes::Date32Type,
as_naive_date,
Date32Type::from_naive_date
);
impl_timestamp_from_variant!(
datatypes::TimestampMicrosecondType,
as_timestamp_ntz_micros,
ntz = true,
Self::make_value,
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I use Self::make_value here instead of make_value because I think Self::make_value is the full function name; I can change it if needed.

);
// Microsecond timestamps, `NTZ = false`: `as_timestamp_micros` followed by `make_value_tz`.
impl_timestamp_from_variant!(
datatypes::TimestampMicrosecondType,
as_timestamp_micros,
ntz = false,
Self::make_value_tz
);
// Nanosecond timestamps, `NTZ = true`: `as_timestamp_ntz_nanos` followed by `make_value`.
impl_timestamp_from_variant!(
datatypes::TimestampNanosecondType,
as_timestamp_ntz_nanos,
ntz = true,
Self::make_value
);
// Nanosecond timestamps, `NTZ = false`: `as_timestamp_nanos` followed by `make_value_tz`.
impl_timestamp_from_variant!(
datatypes::TimestampNanosecondType,
as_timestamp_nanos,
ntz = false,
Self::make_value_tz
);

/// Convert the value at a specific index in the given array into a `Variant`.
macro_rules! non_generic_conversion_single_value {
Expand Down
182 changes: 160 additions & 22 deletions parquet-variant-compute/src/variant_get.rs
Original file line number Diff line number Diff line change
Expand Up @@ -300,13 +300,14 @@ mod test {
use crate::json_to_variant;
use crate::variant_array::{ShreddedVariantFieldArray, StructArrayBuilder};
use arrow::array::{
Array, ArrayRef, AsArray, BinaryViewArray, Date32Array, Float32Array, Float64Array,
Int8Array, Int16Array, Int32Array, Int64Array, StringArray, StructArray,
Array, ArrayRef, AsArray, BinaryViewArray, BooleanArray, Date32Array, Float32Array,
Float64Array, Int8Array, Int16Array, Int32Array, Int64Array, StringArray, StructArray,
};
use arrow::buffer::NullBuffer;
use arrow::compute::CastOptions;
use arrow::datatypes::DataType::{Int16, Int32, Int64};
use arrow_schema::{DataType, Field, FieldRef, Fields};
use arrow_schema::DataType::{Boolean, Float32, Float64, Int8};
use arrow_schema::{DataType, Field, FieldRef, Fields, TimeUnit};
use chrono::DateTime;
use parquet_variant::{EMPTY_VARIANT_METADATA_BYTES, Variant, VariantPath};

Expand Down Expand Up @@ -700,7 +701,7 @@ mod test {
}

macro_rules! perfectly_shredded_to_arrow_primitive_test {
($name:ident, $primitive_type:ident, $perfectly_shredded_array_gen_fun:ident, $expected_array:expr) => {
($name:ident, $primitive_type:expr, $perfectly_shredded_array_gen_fun:ident, $expected_array:expr) => {
#[test]
fn $name() {
let array = $perfectly_shredded_array_gen_fun();
Expand All @@ -713,6 +714,13 @@ mod test {
};
}

// Perfectly shredded Int8 input requested as `Int8`; the expected output is the same values.
perfectly_shredded_to_arrow_primitive_test!(
get_variant_perfectly_shredded_int8_as_int8,
Int8,
perfectly_shredded_int8_variant_array,
Int8Array::from(vec![Some(1), Some(2), Some(3)])
);

perfectly_shredded_to_arrow_primitive_test!(
get_variant_perfectly_shredded_int16_as_int16,
Int16,
Expand All @@ -734,31 +742,37 @@ mod test {
Int64Array::from(vec![Some(1), Some(2), Some(3)])
);

/// Return a VariantArray that represents a perfectly "shredded" variant
/// for the given typed value.
///
/// The schema of the corresponding `StructArray` would look like this:
///
/// ```text
/// StructArray {
/// metadata: BinaryViewArray,
/// typed_value: Int32Array,
/// }
/// ```
macro_rules! numeric_perfectly_shredded_variant_array_fn {
($func:ident, $array_type:ident, $primitive_type:ty) => {
// Float32 input requested as `Float32`.
perfectly_shredded_to_arrow_primitive_test!(
get_variant_perfectly_shredded_float32_as_float32,
Float32,
perfectly_shredded_float32_variant_array,
Float32Array::from(vec![Some(1.0), Some(2.0), Some(3.0)])
);

// Float64 input requested as `Float64`.
perfectly_shredded_to_arrow_primitive_test!(
get_variant_perfectly_shredded_float64_as_float64,
Float64,
perfectly_shredded_float64_variant_array,
Float64Array::from(vec![Some(1.0), Some(2.0), Some(3.0)])
);

// Boolean input requested as `Boolean`.
perfectly_shredded_to_arrow_primitive_test!(
get_variant_perfectly_shredded_boolean_as_boolean,
Boolean,
perfectly_shredded_bool_variant_array,
BooleanArray::from(vec![Some(true), Some(false), Some(true)])
);

macro_rules! perfectly_shredded_variant_array_fn {
($func:ident, $typed_value_gen:expr) => {
fn $func() -> ArrayRef {
// At the time of writing, the `VariantArrayBuilder` does not support shredding.
// so we must construct the array manually. see https://github.com/apache/arrow-rs/issues/7895
let metadata = BinaryViewArray::from_iter_values(std::iter::repeat_n(
EMPTY_VARIANT_METADATA_BYTES,
3,
));
let typed_value = $array_type::from(vec![
Some(<$primitive_type>::try_from(1u8).unwrap()),
Some(<$primitive_type>::try_from(2u8).unwrap()),
Some(<$primitive_type>::try_from(3u8).unwrap()),
]);
let typed_value = $typed_value_gen();

let struct_array = StructArrayBuilder::new()
.with_field("metadata", Arc::new(metadata), false)
Expand All @@ -772,6 +786,33 @@ mod test {
};
}

// Generator for a shredded array whose typed values are `[true, false, true]`.
perfectly_shredded_variant_array_fn!(perfectly_shredded_bool_variant_array, || {
BooleanArray::from(vec![Some(true), Some(false), Some(true)])
});

/// Define a function `$func` that returns a perfectly "shredded" VariantArray
/// whose typed values are the numbers 1, 2 and 3, stored in an `$array_type`.
///
/// For example, with `$array_type = Int32Array` the schema of the corresponding
/// `StructArray` would look like this:
///
/// ```text
/// StructArray {
///   metadata: BinaryViewArray,
///   typed_value: Int32Array,
/// }
/// ```
macro_rules! numeric_perfectly_shredded_variant_array_fn {
    ($func:ident, $array_type:ident, $primitive_type:ty) => {
        perfectly_shredded_variant_array_fn!($func, || {
            // Start from `u8` so the same literals convert into every numeric target type.
            let values: Vec<Option<$primitive_type>> = (1u8..=3)
                .map(|raw| Some(<$primitive_type>::try_from(raw).unwrap()))
                .collect();
            $array_type::from(values)
        });
    };
}

numeric_perfectly_shredded_variant_array_fn!(
perfectly_shredded_int8_variant_array,
Int8Array,
Expand Down Expand Up @@ -803,6 +844,103 @@ mod test {
f64
);

// Microsecond timestamps without a timezone; the first value is negative.
perfectly_shredded_variant_array_fn!(
perfectly_shredded_timestamp_micro_ntz_variant_array,
|| {
arrow::array::TimestampMicrosecondArray::from(vec![
Some(-456000),
Some(1758602096000001),
Some(1758602096000002),
])
}
);

// Requested as `Timestamp(Microsecond, None)`; expected values equal the generator's values.
perfectly_shredded_to_arrow_primitive_test!(
get_variant_perfectly_shredded_timestamp_micro_ntz_as_timestamp_micro_ntz,
DataType::Timestamp(TimeUnit::Microsecond, None),
perfectly_shredded_timestamp_micro_ntz_variant_array,
arrow::array::TimestampMicrosecondArray::from(vec![
Some(-456000),
Some(1758602096000001),
Some(1758602096000002),
])
);

// Microsecond timestamps tagged with the "+00:00" timezone; the first value is negative.
perfectly_shredded_variant_array_fn!(perfectly_shredded_timestamp_micro_variant_array, || {
arrow::array::TimestampMicrosecondArray::from(vec![
Some(-456000),
Some(1758602096000001),
Some(1758602096000002),
])
.with_timezone("+00:00")
});

// Requested as `Timestamp(Microsecond, Some("+00:00"))`; expected values equal the generator's.
perfectly_shredded_to_arrow_primitive_test!(
get_variant_perfectly_shredded_timestamp_micro_as_timestamp_micro,
DataType::Timestamp(TimeUnit::Microsecond, Some(Arc::from("+00:00"))),
perfectly_shredded_timestamp_micro_variant_array,
arrow::array::TimestampMicrosecondArray::from(vec![
Some(-456000),
Some(1758602096000001),
Some(1758602096000002),
])
.with_timezone("+00:00")
);

// Nanosecond timestamps without a timezone; the first value is negative.
perfectly_shredded_variant_array_fn!(
perfectly_shredded_timestamp_nano_ntz_variant_array,
|| {
arrow::array::TimestampNanosecondArray::from(vec![
Some(-4999999561),
Some(1758602096000000001),
Some(1758602096000000002),
])
}
);

// Requested as `Timestamp(Nanosecond, None)`; expected values equal the generator's values.
perfectly_shredded_to_arrow_primitive_test!(
get_variant_perfectly_shredded_timestamp_nano_ntz_as_timestamp_nano_ntz,
DataType::Timestamp(TimeUnit::Nanosecond, None),
perfectly_shredded_timestamp_nano_ntz_variant_array,
arrow::array::TimestampNanosecondArray::from(vec![
Some(-4999999561),
Some(1758602096000000001),
Some(1758602096000000002),
])
);

// Nanosecond timestamps tagged with the "+00:00" timezone; the first value is negative.
perfectly_shredded_variant_array_fn!(perfectly_shredded_timestamp_nano_variant_array, || {
arrow::array::TimestampNanosecondArray::from(vec![
Some(-4999999561),
Some(1758602096000000001),
Some(1758602096000000002),
])
.with_timezone("+00:00")
});

// Requested as `Timestamp(Nanosecond, Some("+00:00"))`; expected values equal the generator's.
perfectly_shredded_to_arrow_primitive_test!(
get_variant_perfectly_shredded_timestamp_nano_as_timestamp_nano,
DataType::Timestamp(TimeUnit::Nanosecond, Some(Arc::from("+00:00"))),
perfectly_shredded_timestamp_nano_variant_array,
arrow::array::TimestampNanosecondArray::from(vec![
Some(-4999999561),
Some(1758602096000000001),
Some(1758602096000000002),
])
.with_timezone("+00:00")
);

// Date32 values, one negative and two positive.
perfectly_shredded_variant_array_fn!(perfectly_shredded_date_variant_array, || {
Date32Array::from(vec![Some(-12345), Some(17586), Some(20000)])
});

// Requested as `Date32`; expected values equal the generator's values.
perfectly_shredded_to_arrow_primitive_test!(
get_variant_perfectly_shredded_date_as_date,
DataType::Date32,
perfectly_shredded_date_variant_array,
Date32Array::from(vec![Some(-12345), Some(17586), Some(20000)])
);

macro_rules! assert_variant_get_as_variant_array_with_default_option {
($variant_array: expr, $array_expected: expr) => {{
let options = GetOptions::new();
Expand Down
Loading
Loading