-
Notifications
You must be signed in to change notification settings - Fork 3.4k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
ARROW-11365: [Rust] [Parquet] Logical type printer and parser #9705
Closed
Closed
Changes from all commits
Commits
Show all changes
6 commits
Select commit
Hold shift + click to select a range
f42337e
use logical types to map to Arrow types
nevi-me 5b8578f
address review comments
nevi-me b4eb0de
schema printer for logical types
nevi-me 117b26a
implement logical type parser
nevi-me ffab87f
test logical types, remove TODO
nevi-me 6aaf4fa
address review comments
nevi-me File filter
Filter by extension
Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -942,11 +942,14 @@ mod tests { | |
REQUIRED BOOLEAN boolean; | ||
REQUIRED INT32 int8 (INT_8); | ||
REQUIRED INT32 int16 (INT_16); | ||
REQUIRED INT32 uint8 (INTEGER(8,false)); | ||
REQUIRED INT32 uint16 (INTEGER(16,false)); | ||
REQUIRED INT32 int32; | ||
REQUIRED INT64 int64 ; | ||
OPTIONAL DOUBLE double; | ||
OPTIONAL FLOAT float; | ||
OPTIONAL BINARY string (UTF8); | ||
OPTIONAL BINARY string_2 (STRING); | ||
} | ||
"; | ||
let parquet_group_type = parse_message_type(message_type).unwrap(); | ||
|
@@ -959,11 +962,14 @@ mod tests { | |
Field::new("boolean", DataType::Boolean, false), | ||
Field::new("int8", DataType::Int8, false), | ||
Field::new("int16", DataType::Int16, false), | ||
Field::new("uint8", DataType::UInt8, false), | ||
Field::new("uint16", DataType::UInt16, false), | ||
Field::new("int32", DataType::Int32, false), | ||
Field::new("int64", DataType::Int64, false), | ||
Field::new("double", DataType::Float64, true), | ||
Field::new("float", DataType::Float32, true), | ||
Field::new("string", DataType::Utf8, true), | ||
Field::new("string_2", DataType::Utf8, true), | ||
]; | ||
|
||
assert_eq!(&arrow_fields, converted_arrow_schema.fields()); | ||
|
@@ -1508,18 +1514,22 @@ mod tests { | |
message test_schema { | ||
REQUIRED BOOLEAN boolean; | ||
REQUIRED INT32 int8 (INT_8); | ||
REQUIRED INT32 uint8 (INTEGER(8,false)); | ||
REQUIRED INT32 int16 (INT_16); | ||
REQUIRED INT32 uint16 (INTEGER(16,false)); | ||
REQUIRED INT32 int32; | ||
REQUIRED INT64 int64 ; | ||
REQUIRED INT64 int64; | ||
OPTIONAL DOUBLE double; | ||
OPTIONAL FLOAT float; | ||
OPTIONAL BINARY string (UTF8); | ||
REPEATED BOOLEAN bools; | ||
OPTIONAL INT32 date (DATE); | ||
OPTIONAL INT32 time_milli (TIME_MILLIS); | ||
OPTIONAL INT64 time_micro (TIME_MICROS); | ||
OPTIONAL INT64 time_nano (TIME(NANOS,false)); | ||
OPTIONAL INT64 ts_milli (TIMESTAMP_MILLIS); | ||
REQUIRED INT64 ts_micro (TIMESTAMP_MICROS); | ||
REQUIRED INT64 ts_nano (TIMESTAMP(NANOS,true)); | ||
} | ||
"; | ||
let parquet_group_type = parse_message_type(message_type).unwrap(); | ||
|
@@ -1534,7 +1544,9 @@ mod tests { | |
let arrow_fields = vec![ | ||
Field::new("boolean", DataType::Boolean, false), | ||
Field::new("int8", DataType::Int8, false), | ||
Field::new("uint8", DataType::UInt8, false), | ||
Field::new("int16", DataType::Int16, false), | ||
Field::new("uint16", DataType::UInt16, false), | ||
Field::new("int32", DataType::Int32, false), | ||
Field::new("int64", DataType::Int64, false), | ||
Field::new("double", DataType::Float64, true), | ||
|
@@ -1548,6 +1560,7 @@ mod tests { | |
Field::new("date", DataType::Date32, true), | ||
Field::new("time_milli", DataType::Time32(TimeUnit::Millisecond), true), | ||
Field::new("time_micro", DataType::Time64(TimeUnit::Microsecond), true), | ||
Field::new("time_nano", DataType::Time64(TimeUnit::Nanosecond), true), | ||
Field::new( | ||
"ts_milli", | ||
DataType::Timestamp(TimeUnit::Millisecond, None), | ||
|
@@ -1558,24 +1571,28 @@ mod tests { | |
DataType::Timestamp(TimeUnit::Microsecond, None), | ||
false, | ||
), | ||
Field::new( | ||
"ts_nano", | ||
DataType::Timestamp(TimeUnit::Nanosecond, Some("UTC".to_string())), | ||
false, | ||
), | ||
]; | ||
|
||
assert_eq!(arrow_fields, converted_arrow_fields); | ||
} | ||
|
||
#[test] | ||
#[ignore = "To be addressed as part of ARROW-11365"] | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. 🎉 |
||
fn test_field_to_column_desc() { | ||
let message_type = " | ||
message arrow_schema { | ||
REQUIRED BOOLEAN boolean; | ||
REQUIRED INT32 int8 (INT_8); | ||
REQUIRED INT32 int16 (INT_16); | ||
REQUIRED INT32 int16 (INTEGER(16,true)); | ||
REQUIRED INT32 int32; | ||
REQUIRED INT64 int64; | ||
OPTIONAL DOUBLE double; | ||
OPTIONAL FLOAT float; | ||
OPTIONAL BINARY string (UTF8); | ||
OPTIONAL BINARY string (STRING); | ||
OPTIONAL GROUP bools (LIST) { | ||
REPEATED GROUP list { | ||
OPTIONAL BOOLEAN element; | ||
|
@@ -1587,20 +1604,20 @@ mod tests { | |
} | ||
} | ||
OPTIONAL INT32 date (DATE); | ||
OPTIONAL INT32 time_milli (TIME_MILLIS); | ||
OPTIONAL INT32 time_milli (TIME(MILLIS,false)); | ||
OPTIONAL INT64 time_micro (TIME_MICROS); | ||
OPTIONAL INT64 ts_milli (TIMESTAMP_MILLIS); | ||
REQUIRED INT64 ts_micro (TIMESTAMP_MICROS); | ||
REQUIRED INT64 ts_micro (TIMESTAMP(MICROS,false)); | ||
REQUIRED GROUP struct { | ||
REQUIRED BOOLEAN bools; | ||
REQUIRED INT32 uint32 (UINT_32); | ||
REQUIRED INT32 uint32 (INTEGER(32,false)); | ||
REQUIRED GROUP int32 (LIST) { | ||
REPEATED GROUP list { | ||
OPTIONAL INT32 element; | ||
} | ||
} | ||
} | ||
REQUIRED BINARY dictionary_strings (UTF8); | ||
REQUIRED BINARY dictionary_strings (STRING); | ||
} | ||
"; | ||
let parquet_group_type = parse_message_type(message_type).unwrap(); | ||
|
@@ -1674,8 +1691,20 @@ mod tests { | |
.iter() | ||
.zip(converted_arrow_schema.columns()) | ||
.for_each(|(a, b)| { | ||
// TODO: ARROW-11365: If parsing v1 format, there should be no logical type | ||
assert_eq!(a, b); | ||
// Only check logical type if it's set on the Parquet side. | ||
// This is because the Arrow conversion always sets logical type, | ||
// even if there wasn't originally one. | ||
// This is not an issue, but is an inconvenience for this test. | ||
match a.logical_type() { | ||
Some(_) => { | ||
assert_eq!(a, b) | ||
} | ||
None => { | ||
assert_eq!(a.name(), b.name()); | ||
assert_eq!(a.physical_type(), b.physical_type()); | ||
assert_eq!(a.converted_type(), b.converted_type()); | ||
} | ||
}; | ||
}); | ||
} | ||
|
||
|
@@ -1694,7 +1723,7 @@ mod tests { | |
fn test_metadata() { | ||
let message_type = " | ||
message test_schema { | ||
OPTIONAL BINARY string (UTF8); | ||
OPTIONAL BINARY string (STRING); | ||
} | ||
"; | ||
let parquet_group_type = parse_message_type(message_type).unwrap(); | ||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.
Add this suggestion to a batch that can be applied as a single commit.
This suggestion is invalid because no changes were made to the code.
Suggestions cannot be applied while the pull request is closed.
Suggestions cannot be applied while viewing a subset of changes.
Only one suggestion per line can be applied in a batch.
Add this suggestion to a batch that can be applied as a single commit.
Applying suggestions on deleted lines is not supported.
You must change the existing code in this line in order to create a valid suggestion.
Outdated suggestions cannot be applied.
This suggestion has been applied or marked resolved.
Suggestions cannot be applied from pending reviews.
Suggestions cannot be applied on multi-line comments.
Suggestions cannot be applied while the pull request is queued to merge.
Suggestion cannot be applied right now. Please check back later.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
is it cool / needed that
UTF8
andSTRING
both map to ArrowDataType::Utf8
? It seems likeUTF8
is not actually a valid "logical type" in parquet -- it should beSTRING
https://github.com/apache/parquet-format/blob/master/LogicalTypes.md#string-types
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This is where things get interesting. UTF8 maps to the converted type, and STRING to the logical type. So, the pecking order is to check for logical type, then fall back to converted type.
It gets tricky when the logical and converted types are the same string value, but that is fine as the converted type is always written out. Q good example is DECIMAL(12,2), it is the same in either logical or converted type.
I was confused with this, as I initially tried parsing logical and converted types separately, without mixing them in one file. After @sunchao's review on the other PRs, it started to make sense that they can coexist.
So, both map to the same Arrow type