Skip to content

Commit

Permalink
Support UTF8 cast to Timestamp with timezone (#3673)
Browse files Browse the repository at this point in the history
* support cast UTF8 to Timestamp

* fmt

* fix docs and tests
  • Loading branch information
comphead committed Feb 10, 2023
1 parent bb4fc59 commit 07e2063
Show file tree
Hide file tree
Showing 3 changed files with 61 additions and 10 deletions.
46 changes: 42 additions & 4 deletions arrow-cast/src/cast.rs
Original file line number Diff line number Diff line change
Expand Up @@ -166,7 +166,7 @@ pub fn can_cast_types(from_type: &DataType, to_type: &DataType) -> bool {
| Time32(TimeUnit::Millisecond)
| Time64(TimeUnit::Microsecond)
| Time64(TimeUnit::Nanosecond)
| Timestamp(TimeUnit::Nanosecond, None)
| Timestamp(TimeUnit::Nanosecond, _)
) => true,
(Utf8, _) => to_type.is_numeric() && to_type != &Float16,
(LargeUtf8,
Expand All @@ -179,7 +179,7 @@ pub fn can_cast_types(from_type: &DataType, to_type: &DataType) -> bool {
| Time32(TimeUnit::Millisecond)
| Time64(TimeUnit::Microsecond)
| Time64(TimeUnit::Nanosecond)
| Timestamp(TimeUnit::Nanosecond, None)
| Timestamp(TimeUnit::Nanosecond, _)
) => true,
(LargeUtf8, _) => to_type.is_numeric() && to_type != &Float16,
(_, Utf8 | LargeUtf8) => from_type.is_primitive(),
Expand Down Expand Up @@ -1141,7 +1141,7 @@ pub fn cast_with_options(
Time64(TimeUnit::Nanosecond) => {
cast_string_to_time64nanosecond::<i32>(array, cast_options)
}
Timestamp(TimeUnit::Nanosecond, None) => {
Timestamp(TimeUnit::Nanosecond, _) => {
cast_string_to_timestamp_ns::<i32>(array, cast_options)
}
_ => Err(ArrowError::CastError(format!(
Expand Down Expand Up @@ -1182,7 +1182,7 @@ pub fn cast_with_options(
Time64(TimeUnit::Nanosecond) => {
cast_string_to_time64nanosecond::<i64>(array, cast_options)
}
Timestamp(TimeUnit::Nanosecond, None) => {
Timestamp(TimeUnit::Nanosecond, _) => {
cast_string_to_timestamp_ns::<i64>(array, cast_options)
}
_ => Err(ArrowError::CastError(format!(
Expand Down Expand Up @@ -7550,4 +7550,42 @@ mod tests {
assert_eq!(v.value(0), 946728000000);
assert_eq!(v.value(1), 1608035696000);
}

#[test]
fn test_cast_utf8_to_timestamp() {
    /// Casts a fixed set of timestamp strings to
    /// `Timestamp(Nanosecond, Some(tz))` and checks each resulting value.
    ///
    /// The expected values are the same for every `tz` passed in by the
    /// caller below.
    fn test_tz(tz: String) {
        // Input string paired with the expected nanosecond value after the cast.
        let cases: [(&str, i64); 9] = [
            ("2023-01-01 04:05:06.789000-08:00", 1672574706789000000),
            ("2023-01-01 04:05:06.789000-07:00", 1672571106789000000),
            ("2023-01-01 04:05:06.789 -0800", 1672574706789000000),
            ("2023-01-01 04:05:06.789 -08:00", 1672574706789000000),
            ("2023-01-01 040506 +0730", 1672518906000000000),
            ("2023-01-01 040506 +07:30", 1672518906000000000),
            ("2023-01-01 04:05:06.789", 1672545906789000000),
            ("2023-01-01 04:05:06", 1672545906000000000),
            ("2023-01-01", 1672531200000000000),
        ];

        let inputs: Vec<&str> = cases.iter().map(|(text, _)| *text).collect();
        let array = Arc::new(StringArray::from(inputs)) as ArrayRef;

        let target_type = DataType::Timestamp(TimeUnit::Nanosecond, Some(tz));
        let casted = cast(&array, &target_type).unwrap();
        let timestamps = casted
            .as_any()
            .downcast_ref::<TimestampNanosecondArray>()
            .unwrap();

        for (idx, (_, expected)) in cases.iter().enumerate() {
            assert_eq!(*expected, timestamps.value(idx));
        }
    }

    ["+00:00", "+02:00"]
        .iter()
        .for_each(|tz| test_tz(tz.to_string()));
}
}
21 changes: 17 additions & 4 deletions arrow-cast/src/parse.rs
Original file line number Diff line number Diff line change
Expand Up @@ -68,6 +68,14 @@ use chrono::prelude::*;
/// the system timezone is set to Americas/New_York (UTC-5) the
/// timestamp will be interpreted as though it were
/// `1997-01-31T09:26:56.123-05:00`
///
/// Some formats that are supported by PostgreSQL <https://www.postgresql.org/docs/current/datatype-datetime.html#DATATYPE-DATETIME-TIME-TABLE>
/// are still not supported by chrono, such as
/// "2023-01-01 040506 America/Los_Angeles",
/// "2023-01-01 04:05:06.789 +07:30:00",
/// "2023-01-01 040506 +07:30:00",
/// "2023-01-01 04:05:06.789 PST",
/// "2023-01-01 04:05:06.789 -08",
#[inline]
pub fn string_to_timestamp_nanos(s: &str) -> Result<i64, ArrowError> {
// Fast path: RFC3339 timestamp (with a T)
Expand All @@ -81,10 +89,15 @@ pub fn string_to_timestamp_nanos(s: &str) -> Result<i64, ArrowError> {
// separating the date and time with a space ' ' rather than 'T' to be
// (more) compatible with Apache Spark SQL

// timezone offset, using ' ' as a separator
// Example: 2020-09-08 13:42:29.190855-05:00
if let Ok(ts) = DateTime::parse_from_str(s, "%Y-%m-%d %H:%M:%S%.f%:z") {
return to_timestamp_nanos(ts.naive_utc());
let supported_formats = vec![
"%Y-%m-%d %H:%M:%S%.f%:z", // Example: 2020-09-08 13:42:29.190855-05:00
"%Y-%m-%d %H%M%S%.3f%:z", // Example: "2023-01-01 040506 +07:30"
];

for f in supported_formats.iter() {
if let Ok(ts) = DateTime::parse_from_str(s, f) {
return to_timestamp_nanos(ts.naive_utc());
}
}

// with an explicit Z, using ' ' as a separator
Expand Down
4 changes: 2 additions & 2 deletions arrow/tests/array_cast.rs
Original file line number Diff line number Diff line change
Expand Up @@ -97,7 +97,7 @@ fn test_can_cast_types() {

/// Create instances of arrays with varying types for cast tests
fn get_arrays_of_all_types() -> Vec<ArrayRef> {
let tz_name = String::from("America/New_York");
let tz_name = String::from("+08:00");
let binary_data: Vec<&[u8]> = vec![b"foo", b"bar"];
vec![
Arc::new(BinaryArray::from(binary_data.clone())),
Expand Down Expand Up @@ -349,7 +349,7 @@ fn create_decimal_array(
// Get a selection of datatypes to try and cast to
fn get_all_types() -> Vec<DataType> {
use DataType::*;
let tz_name = String::from("America/New_York");
let tz_name = String::from("+08:00");

let mut types = vec![
Null,
Expand Down

0 comments on commit 07e2063

Please sign in to comment.