Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

ARROW-11365: [Rust] [Parquet] Logical type printer and parser #9705

Closed
wants to merge 6 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
51 changes: 40 additions & 11 deletions rust/parquet/src/arrow/schema.rs
Original file line number Diff line number Diff line change
Expand Up @@ -942,11 +942,14 @@ mod tests {
REQUIRED BOOLEAN boolean;
REQUIRED INT32 int8 (INT_8);
REQUIRED INT32 int16 (INT_16);
REQUIRED INT32 uint8 (INTEGER(8,false));
REQUIRED INT32 uint16 (INTEGER(16,false));
REQUIRED INT32 int32;
REQUIRED INT64 int64 ;
OPTIONAL DOUBLE double;
OPTIONAL FLOAT float;
OPTIONAL BINARY string (UTF8);
OPTIONAL BINARY string_2 (STRING);
}
";
let parquet_group_type = parse_message_type(message_type).unwrap();
Expand All @@ -959,11 +962,14 @@ mod tests {
Field::new("boolean", DataType::Boolean, false),
Field::new("int8", DataType::Int8, false),
Field::new("int16", DataType::Int16, false),
Field::new("uint8", DataType::UInt8, false),
Field::new("uint16", DataType::UInt16, false),
Field::new("int32", DataType::Int32, false),
Field::new("int64", DataType::Int64, false),
Field::new("double", DataType::Float64, true),
Field::new("float", DataType::Float32, true),
Field::new("string", DataType::Utf8, true),
Field::new("string_2", DataType::Utf8, true),
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

is it cool / needed that UTF8 and STRING both map to Arrow DataType::Utf8? It seems like UTF8 is not actually a valid "logical type" in parquet -- it should be STRING

https://github.com/apache/parquet-format/blob/master/LogicalTypes.md#string-types

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is where things get interesting. UTF8 maps to the converted type, and STRING to the logical type. So, the pecking order is to check for logical type, then fall back to converted type.

It gets tricky when the logical and converted types are the same string value, but that is fine as the converted type is always written out. Q good example is DECIMAL(12,2), it is the same in either logical or converted type.

I was confused with this, as I initially tried parsing logical and converted types separately, without mixing them in one file. After @sunchao's review on the other PRs, it started to make sense that they can coexist.

So, both map to the same Arrow type

];

assert_eq!(&arrow_fields, converted_arrow_schema.fields());
Expand Down Expand Up @@ -1508,18 +1514,22 @@ mod tests {
message test_schema {
REQUIRED BOOLEAN boolean;
REQUIRED INT32 int8 (INT_8);
REQUIRED INT32 uint8 (INTEGER(8,false));
REQUIRED INT32 int16 (INT_16);
REQUIRED INT32 uint16 (INTEGER(16,false));
REQUIRED INT32 int32;
REQUIRED INT64 int64 ;
REQUIRED INT64 int64;
OPTIONAL DOUBLE double;
OPTIONAL FLOAT float;
OPTIONAL BINARY string (UTF8);
REPEATED BOOLEAN bools;
OPTIONAL INT32 date (DATE);
OPTIONAL INT32 time_milli (TIME_MILLIS);
OPTIONAL INT64 time_micro (TIME_MICROS);
OPTIONAL INT64 time_nano (TIME(NANOS,false));
OPTIONAL INT64 ts_milli (TIMESTAMP_MILLIS);
REQUIRED INT64 ts_micro (TIMESTAMP_MICROS);
REQUIRED INT64 ts_nano (TIMESTAMP(NANOS,true));
}
";
let parquet_group_type = parse_message_type(message_type).unwrap();
Expand All @@ -1534,7 +1544,9 @@ mod tests {
let arrow_fields = vec![
Field::new("boolean", DataType::Boolean, false),
Field::new("int8", DataType::Int8, false),
Field::new("uint8", DataType::UInt8, false),
Field::new("int16", DataType::Int16, false),
Field::new("uint16", DataType::UInt16, false),
Field::new("int32", DataType::Int32, false),
Field::new("int64", DataType::Int64, false),
Field::new("double", DataType::Float64, true),
Expand All @@ -1548,6 +1560,7 @@ mod tests {
Field::new("date", DataType::Date32, true),
Field::new("time_milli", DataType::Time32(TimeUnit::Millisecond), true),
Field::new("time_micro", DataType::Time64(TimeUnit::Microsecond), true),
Field::new("time_nano", DataType::Time64(TimeUnit::Nanosecond), true),
Field::new(
"ts_milli",
DataType::Timestamp(TimeUnit::Millisecond, None),
Expand All @@ -1558,24 +1571,28 @@ mod tests {
DataType::Timestamp(TimeUnit::Microsecond, None),
false,
),
Field::new(
"ts_nano",
DataType::Timestamp(TimeUnit::Nanosecond, Some("UTC".to_string())),
false,
),
];

assert_eq!(arrow_fields, converted_arrow_fields);
}

#[test]
#[ignore = "To be addressed as part of ARROW-11365"]
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

🎉

fn test_field_to_column_desc() {
let message_type = "
message arrow_schema {
REQUIRED BOOLEAN boolean;
REQUIRED INT32 int8 (INT_8);
REQUIRED INT32 int16 (INT_16);
REQUIRED INT32 int16 (INTEGER(16,true));
REQUIRED INT32 int32;
REQUIRED INT64 int64;
OPTIONAL DOUBLE double;
OPTIONAL FLOAT float;
OPTIONAL BINARY string (UTF8);
OPTIONAL BINARY string (STRING);
OPTIONAL GROUP bools (LIST) {
REPEATED GROUP list {
OPTIONAL BOOLEAN element;
Expand All @@ -1587,20 +1604,20 @@ mod tests {
}
}
OPTIONAL INT32 date (DATE);
OPTIONAL INT32 time_milli (TIME_MILLIS);
OPTIONAL INT32 time_milli (TIME(MILLIS,false));
OPTIONAL INT64 time_micro (TIME_MICROS);
OPTIONAL INT64 ts_milli (TIMESTAMP_MILLIS);
REQUIRED INT64 ts_micro (TIMESTAMP_MICROS);
REQUIRED INT64 ts_micro (TIMESTAMP(MICROS,false));
REQUIRED GROUP struct {
REQUIRED BOOLEAN bools;
REQUIRED INT32 uint32 (UINT_32);
REQUIRED INT32 uint32 (INTEGER(32,false));
REQUIRED GROUP int32 (LIST) {
REPEATED GROUP list {
OPTIONAL INT32 element;
}
}
}
REQUIRED BINARY dictionary_strings (UTF8);
REQUIRED BINARY dictionary_strings (STRING);
}
";
let parquet_group_type = parse_message_type(message_type).unwrap();
Expand Down Expand Up @@ -1674,8 +1691,20 @@ mod tests {
.iter()
.zip(converted_arrow_schema.columns())
.for_each(|(a, b)| {
// TODO: ARROW-11365: If parsing v1 format, there should be no logical type
assert_eq!(a, b);
// Only check logical type if it's set on the Parquet side.
// This is because the Arrow conversion always sets logical type,
// even if there wasn't originally one.
// This is not an issue, but is an inconvenience for this test.
match a.logical_type() {
Some(_) => {
assert_eq!(a, b)
}
None => {
assert_eq!(a.name(), b.name());
assert_eq!(a.physical_type(), b.physical_type());
assert_eq!(a.converted_type(), b.converted_type());
}
};
});
}

Expand All @@ -1694,7 +1723,7 @@ mod tests {
fn test_metadata() {
let message_type = "
message test_schema {
OPTIONAL BINARY string (UTF8);
OPTIONAL BINARY string (STRING);
}
";
let parquet_group_type = parse_message_type(message_type).unwrap();
Expand Down
73 changes: 8 additions & 65 deletions rust/parquet/src/basic.rs
Original file line number Diff line number Diff line change
Expand Up @@ -847,84 +847,27 @@ impl str::FromStr for LogicalType {

fn from_str(s: &str) -> result::Result<Self, Self::Err> {
match s {
"INTEGER(8,true)" => Ok(LogicalType::INTEGER(IntType {
// The type is a placeholder that gets updated elsewhere
"INTEGER" => Ok(LogicalType::INTEGER(IntType {
bit_width: 8,
is_signed: true,
})),
"INTEGER(16,true)" => Ok(LogicalType::INTEGER(IntType {
bit_width: 16,
is_signed: true,
})),
"INTEGER(32,true)" => Ok(LogicalType::INTEGER(IntType {
bit_width: 32,
is_signed: true,
})),
"INTEGER(64,true)" => Ok(LogicalType::INTEGER(IntType {
bit_width: 64,
is_signed: true,
})),
"INTEGER(8,false)" => Ok(LogicalType::INTEGER(IntType {
bit_width: 8,
is_signed: false,
})),
"INTEGER(16,false)" => Ok(LogicalType::INTEGER(IntType {
bit_width: 16,
is_signed: false,
})),
"INTEGER(32,false)" => Ok(LogicalType::INTEGER(IntType {
bit_width: 32,
is_signed: false,
})),
"INTEGER(64,false)" => Ok(LogicalType::INTEGER(IntType {
bit_width: 64,
is_signed: false,
})),
"MAP" => Ok(LogicalType::MAP(MapType {})),
"LIST" => Ok(LogicalType::LIST(ListType {})),
"ENUM" => Ok(LogicalType::ENUM(EnumType {})),
// TODO: ARROW-11365
// "DECIMAL" => Ok(LogicalType::DECIMAL),
"DATE" => Ok(LogicalType::DATE(DateType {})),
"TIME(MILLIS,true)" => Ok(LogicalType::TIME(TimeType {
is_adjusted_to_u_t_c: true,
unit: TimeUnit::MILLIS(parquet::MilliSeconds {}),
})),
"TIME(MILLIS,false)" => Ok(LogicalType::TIME(TimeType {
is_adjusted_to_u_t_c: false,
unit: TimeUnit::MILLIS(parquet::MilliSeconds {}),
"DECIMAL" => Ok(LogicalType::DECIMAL(DecimalType {
precision: -1,
scale: -1,
})),
"TIME(MICROS,true)" => Ok(LogicalType::TIME(TimeType {
is_adjusted_to_u_t_c: true,
unit: TimeUnit::MICROS(parquet::MicroSeconds {}),
})),
"TIME(MICROS,false)" => Ok(LogicalType::TIME(TimeType {
"DATE" => Ok(LogicalType::DATE(DateType {})),
"TIME" => Ok(LogicalType::TIME(TimeType {
is_adjusted_to_u_t_c: false,
unit: TimeUnit::MICROS(parquet::MicroSeconds {}),
})),
"TIMESTAMP(MILLIS,true)" => Ok(LogicalType::TIMESTAMP(TimestampType {
is_adjusted_to_u_t_c: true,
unit: TimeUnit::MILLIS(parquet::MilliSeconds {}),
})),
"TIMESTAMP(MILLIS,false)" => Ok(LogicalType::TIMESTAMP(TimestampType {
"TIMESTAMP" => Ok(LogicalType::TIMESTAMP(TimestampType {
is_adjusted_to_u_t_c: false,
unit: TimeUnit::MILLIS(parquet::MilliSeconds {}),
})),
"TIMESTAMP(MICROS,true)" => Ok(LogicalType::TIMESTAMP(TimestampType {
is_adjusted_to_u_t_c: true,
unit: TimeUnit::MICROS(parquet::MicroSeconds {}),
})),
"TIMESTAMP(MICROS,false)" => Ok(LogicalType::TIMESTAMP(TimestampType {
is_adjusted_to_u_t_c: false,
unit: TimeUnit::MICROS(parquet::MicroSeconds {}),
})),
"TIMESTAMP(NANOS,true)" => Ok(LogicalType::TIMESTAMP(TimestampType {
is_adjusted_to_u_t_c: true,
unit: TimeUnit::MICROS(parquet::MicroSeconds {}),
})),
"TIMESTAMP(NANOS,false)" => Ok(LogicalType::TIMESTAMP(TimestampType {
is_adjusted_to_u_t_c: false,
unit: TimeUnit::MICROS(parquet::MicroSeconds {}),
})),
"STRING" => Ok(LogicalType::STRING(StringType {})),
"JSON" => Ok(LogicalType::JSON(JsonType {})),
"BSON" => Ok(LogicalType::BSON(BsonType {})),
Expand Down