Skip to content

Commit

Permalink
Fixes Parquet schema of arrays that may contain null
Browse files Browse the repository at this point in the history
  • Loading branch information
liancheng committed Sep 3, 2015
1 parent 391e6be commit 4e57d11
Show file tree
Hide file tree
Showing 2 changed files with 10 additions and 9 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -426,13 +426,14 @@ private[parquet] class CatalystSchemaConverter(
// ArrayType and MapType (for Spark versions <= 1.4.x)
// ===================================================

// Spark 1.4.x and prior versions convert ArrayType with nullable elements into a 3-level
// LIST structure. This behavior mimics parquet-hive (1.6.0rc3). Note that this case is
// covered by the backwards-compatibility rules implemented in `isElementType()`.
// Spark 1.4.x and prior versions convert `ArrayType` with nullable elements into a 3-level
// `LIST` structure. This behavior is a hybrid of parquet-hive and parquet-avro (1.6.0rc3):
// the 3-level structure is similar to parquet-hive, while the 3rd-level element field name
// "array" is borrowed from parquet-avro.
case ArrayType(elementType, nullable @ true) if !followParquetFormatSpec =>
// <list-repetition> group <name> (LIST) {
// repeated group bag {
// repeated <element-type> element;
// optional <element-type> array;
// }
// }
ConversionPatterns.listType(
Expand All @@ -441,8 +442,8 @@ private[parquet] class CatalystSchemaConverter(
Types
.buildGroup(REPEATED)
// "array" is the element field name borrowed from parquet-avro (1.6.0rc3); parquet-hive
// (1.7.0 and prior versions) uses "array_element" instead
.addField(convertField(StructField("array_element", elementType, nullable)))
.named(CatalystConverter.ARRAY_CONTAINS_NULL_BAG_SCHEMA_NAME))
.addField(convertField(StructField("array", elementType, nullable)))
.named("bag"))

// Spark 1.4.x and prior versions convert ArrayType with non-nullable elements into a 2-level
// LIST structure. This behavior mimics parquet-avro (1.6.0rc3). Note that this case is
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -197,7 +197,7 @@ class ParquetSchemaInferenceSuite extends ParquetSchemaTest {
|message root {
| optional group _1 (LIST) {
| repeated group bag {
| optional int32 array_element;
| optional int32 array;
| }
| }
|}
Expand Down Expand Up @@ -266,7 +266,7 @@ class ParquetSchemaInferenceSuite extends ParquetSchemaTest {
| optional binary _1 (UTF8);
| optional group _2 (LIST) {
| repeated group bag {
| optional group array_element {
| optional group array {
| required int32 _1;
| required double _2;
| }
Expand Down Expand Up @@ -645,7 +645,7 @@ class ParquetSchemaSuite extends ParquetSchemaTest {
"""message root {
| optional group f1 (LIST) {
| repeated group bag {
| optional int32 array_element;
| optional int32 array;
| }
| }
|}
Expand Down

0 comments on commit 4e57d11

Please sign in to comment.