From 07e20639b7023fcc61c73f80a5bddf8715c2a06f Mon Sep 17 00:00:00 2001 From: comphead Date: Fri, 10 Feb 2023 03:59:42 -0800 Subject: [PATCH] Support UTF8 cast to Timestamp with timezone (#3673) * support cast UTf8 to Timestamp * fmt * fix docs and tests --- arrow-cast/src/cast.rs | 46 +++++++++++++++++++++++++++++++++++---- arrow-cast/src/parse.rs | 21 ++++++++++++++---- arrow/tests/array_cast.rs | 4 ++-- 3 files changed, 61 insertions(+), 10 deletions(-) diff --git a/arrow-cast/src/cast.rs b/arrow-cast/src/cast.rs index 3137e685b21..1631f2e0040 100644 --- a/arrow-cast/src/cast.rs +++ b/arrow-cast/src/cast.rs @@ -166,7 +166,7 @@ pub fn can_cast_types(from_type: &DataType, to_type: &DataType) -> bool { | Time32(TimeUnit::Millisecond) | Time64(TimeUnit::Microsecond) | Time64(TimeUnit::Nanosecond) - | Timestamp(TimeUnit::Nanosecond, None) + | Timestamp(TimeUnit::Nanosecond, _) ) => true, (Utf8, _) => to_type.is_numeric() && to_type != &Float16, (LargeUtf8, @@ -179,7 +179,7 @@ pub fn can_cast_types(from_type: &DataType, to_type: &DataType) -> bool { | Time32(TimeUnit::Millisecond) | Time64(TimeUnit::Microsecond) | Time64(TimeUnit::Nanosecond) - | Timestamp(TimeUnit::Nanosecond, None) + | Timestamp(TimeUnit::Nanosecond, _) ) => true, (LargeUtf8, _) => to_type.is_numeric() && to_type != &Float16, (_, Utf8 | LargeUtf8) => from_type.is_primitive(), @@ -1141,7 +1141,7 @@ pub fn cast_with_options( Time64(TimeUnit::Nanosecond) => { cast_string_to_time64nanosecond::(array, cast_options) } - Timestamp(TimeUnit::Nanosecond, None) => { + Timestamp(TimeUnit::Nanosecond, _) => { cast_string_to_timestamp_ns::(array, cast_options) } _ => Err(ArrowError::CastError(format!( @@ -1182,7 +1182,7 @@ pub fn cast_with_options( Time64(TimeUnit::Nanosecond) => { cast_string_to_time64nanosecond::(array, cast_options) } - Timestamp(TimeUnit::Nanosecond, None) => { + Timestamp(TimeUnit::Nanosecond, _) => { cast_string_to_timestamp_ns::(array, cast_options) } _ => 
Err(ArrowError::CastError(format!( @@ -7550,4 +7550,42 @@ mod tests { assert_eq!(v.value(0), 946728000000); assert_eq!(v.value(1), 1608035696000); } + + #[test] + fn test_cast_utf8_to_timestamp() { + fn test_tz(tz: String) { + let valid = StringArray::from(vec![ + "2023-01-01 04:05:06.789000-08:00", + "2023-01-01 04:05:06.789000-07:00", + "2023-01-01 04:05:06.789 -0800", + "2023-01-01 04:05:06.789 -08:00", + "2023-01-01 040506 +0730", + "2023-01-01 040506 +07:30", + "2023-01-01 04:05:06.789", + "2023-01-01 04:05:06", + "2023-01-01", + ]); + + let array = Arc::new(valid) as ArrayRef; + let b = cast(&array, &DataType::Timestamp(TimeUnit::Nanosecond, Some(tz))) + .unwrap(); + + let c = b + .as_any() + .downcast_ref::() + .unwrap(); + assert_eq!(1672574706789000000, c.value(0)); + assert_eq!(1672571106789000000, c.value(1)); + assert_eq!(1672574706789000000, c.value(2)); + assert_eq!(1672574706789000000, c.value(3)); + assert_eq!(1672518906000000000, c.value(4)); + assert_eq!(1672518906000000000, c.value(5)); + assert_eq!(1672545906789000000, c.value(6)); + assert_eq!(1672545906000000000, c.value(7)); + assert_eq!(1672531200000000000, c.value(8)); + } + + test_tz("+00:00".to_owned()); + test_tz("+02:00".to_owned()); + } } diff --git a/arrow-cast/src/parse.rs b/arrow-cast/src/parse.rs index 459b94f37dc..f23e65b2284 100644 --- a/arrow-cast/src/parse.rs +++ b/arrow-cast/src/parse.rs @@ -68,6 +68,14 @@ use chrono::prelude::*; /// the system timezone is set to Americas/New_York (UTC-5) the /// timestamp will be interpreted as though it were /// `1997-01-31T09:26:56.123-05:00` +/// +/// Some formats that are supported by PostgreSQL +/// are still not supported by chrono, like +/// "2023-01-01 040506 America/Los_Angeles", +/// "2023-01-01 04:05:06.789 +07:30:00", +/// "2023-01-01 040506 +07:30:00", +/// "2023-01-01 04:05:06.789 PST", +/// "2023-01-01 04:05:06.789 -08", #[inline] pub fn string_to_timestamp_nanos(s: &str) -> Result { // Fast path: RFC3339 timestamp (with a T) @@ 
-81,10 +89,15 @@ pub fn string_to_timestamp_nanos(s: &str) -> Result { // separating the date and time with a space ' ' rather than 'T' to be // (more) compatible with Apache Spark SQL - // timezone offset, using ' ' as a separator - // Example: 2020-09-08 13:42:29.190855-05:00 - if let Ok(ts) = DateTime::parse_from_str(s, "%Y-%m-%d %H:%M:%S%.f%:z") { - return to_timestamp_nanos(ts.naive_utc()); + let supported_formats = vec![ + "%Y-%m-%d %H:%M:%S%.f%:z", // Example: 2020-09-08 13:42:29.190855-05:00 + "%Y-%m-%d %H%M%S%.3f%:z", // Example: "2023-01-01 040506 +07:30" + ]; + + for f in supported_formats.iter() { + if let Ok(ts) = DateTime::parse_from_str(s, f) { + return to_timestamp_nanos(ts.naive_utc()); + } } // with an explicit Z, using ' ' as a separator diff --git a/arrow/tests/array_cast.rs b/arrow/tests/array_cast.rs index 7eeb00a8290..30ded4d70be 100644 --- a/arrow/tests/array_cast.rs +++ b/arrow/tests/array_cast.rs @@ -97,7 +97,7 @@ fn test_can_cast_types() { /// Create instances of arrays with varying types for cast tests fn get_arrays_of_all_types() -> Vec { - let tz_name = String::from("America/New_York"); + let tz_name = String::from("+08:00"); let binary_data: Vec<&[u8]> = vec![b"foo", b"bar"]; vec![ Arc::new(BinaryArray::from(binary_data.clone())), @@ -349,7 +349,7 @@ fn create_decimal_array( // Get a selection of datatypes to try and cast to fn get_all_types() -> Vec { use DataType::*; - let tz_name = String::from("America/New_York"); + let tz_name = String::from("+08:00"); let mut types = vec![ Null,