From 0c9b77a54f47dfd42ccd2454d63f272c5c7b4464 Mon Sep 17 00:00:00 2001
From: hyukjinkwon
Date: Thu, 12 Nov 2015 18:03:37 +0900
Subject: [PATCH 1/5] [SPARK-11692][SQL] Support for Parquet logical types,
 JSON and BSON (embedded types)

---
 .../parquet/CatalystSchemaConverter.scala          |  3 ++-
 .../datasources/parquet/ParquetIOSuite.scala       | 23 +++++++++++++++++++
 2 files changed, 25 insertions(+), 1 deletion(-)

diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/CatalystSchemaConverter.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/CatalystSchemaConverter.scala
index 7f3394c20ed3d..16ffa077e5472 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/CatalystSchemaConverter.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/CatalystSchemaConverter.scala
@@ -163,9 +163,10 @@ private[parquet] class CatalystSchemaConverter(
 
       case BINARY =>
         originalType match {
-          case UTF8 | ENUM => StringType
+          case UTF8 | ENUM | JSON => StringType
           case null if assumeBinaryIsString => StringType
           case null => BinaryType
+          case BSON => BinaryType
           case DECIMAL => makeDecimalType()
           case _ => illegalType()
         }
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetIOSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetIOSuite.scala
index 72744799897be..5cf31c62663a1 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetIOSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetIOSuite.scala
@@ -206,6 +206,29 @@ class ParquetIOSuite extends QueryTest with ParquetTest with SharedSQLContext {
     }
   }
 
+  test("SPARK-11692 Support for unsigned Parquet logical types") {
+    val parquetSchema = MessageTypeParser.parseMessageType(
+      """message root {
+        | required binary a(JSON);
+        | required binary b(BSON);
+        |}
+      """.stripMargin)
+
+    withTempPath { location =>
+      val extraMetadata = Map.empty[String, String].asJava
+      val fileMetadata = new FileMetaData(parquetSchema, extraMetadata, "Spark")
+      val path = new Path(location.getCanonicalPath)
+      val footer = List(
+        new Footer(path, new ParquetMetadata(fileMetadata, Collections.emptyList()))
+      ).asJava
+
+      ParquetFileWriter.writeMetadataFile(sparkContext.hadoopConfiguration, path, footer)
+      val jsonDataType = sqlContext.read.parquet(path.toString).schema(0).dataType
+      assert(jsonDataType == StringType)
+      val bsonDataType = sqlContext.read.parquet(path.toString).schema(1).dataType
+      assert(bsonDataType == BinaryType)
+    }
+  }
   test("compression codec") {
     def compressionCodecFor(path: String, codecName: String): String = {
       val codecs = for {

From d5a962916fac248857711e60a9bd9c59668648df Mon Sep 17 00:00:00 2001
From: hyukjinkwon
Date: Thu, 12 Nov 2015 18:12:46 +0900
Subject: [PATCH 2/5] [SPARK-11692][SQL] Fix typos

---
 .../sql/execution/datasources/parquet/ParquetIOSuite.scala | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetIOSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetIOSuite.scala
index 5cf31c62663a1..6e119ac4ba3aa 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetIOSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetIOSuite.scala
@@ -206,7 +206,7 @@
class ParquetIOSuite extends QueryTest with ParquetTest with SharedSQLContext { } } - test("SPARK-11692 Support for unsigned Parquet logical types") { + test("SPARK-11692 Support for Parquet logical types, JSON and BSON (embedded types)") { val parquetSchema = MessageTypeParser.parseMessageType( """message root { | required binary a(JSON); From 66088e3c117382c26798355e11ce4904077d1dc9 Mon Sep 17 00:00:00 2001 From: Hyukjin Kwon Date: Thu, 12 Nov 2015 21:34:33 +0900 Subject: [PATCH 3/5] [SPARK-11694][SQL] Remove mistakenly added characters. --- .../sql/execution/datasources/parquet/ParquetIOSuite.scala | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetIOSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetIOSuite.scala index 6e119ac4ba3aa..85a7b429c05e0 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetIOSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetIOSuite.scala @@ -211,7 +211,8 @@ class ParquetIOSuite extends QueryTest with ParquetTest with SharedSQLContext { """message root { | required binary a(JSON); | required binary b(BSON); - |} + """.stripMargin) + |} + """.stripMargin) withTempPath { location => val extraMetadata = Map.empty[String, String].asJava From 9f22651a9c9f94369946c323332512797b09cff0 Mon Sep 17 00:00:00 2001 From: Hyukjin Kwon Date: Thu, 12 Nov 2015 21:42:25 +0900 Subject: [PATCH 4/5] [SPARK-11692][SQL] Update indentation --- .../sql/execution/datasources/parquet/ParquetIOSuite.scala | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetIOSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetIOSuite.scala index 85a7b429c05e0..3ff2230f1e903 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetIOSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetIOSuite.scala @@ -212,7 +212,7 @@ class ParquetIOSuite extends QueryTest with ParquetTest with SharedSQLContext { | required binary a(JSON); | required binary b(BSON); |} - """.stripMargin) + """.stripMargin) withTempPath { location => val extraMetadata = Map.empty[String, String].asJava From 11526361eeedd6f30035724a8d600918f71cab23 Mon Sep 17 00:00:00 2001 From: hyukjinkwon Date: Mon, 16 Nov 2015 09:21:54 +0900 Subject: [PATCH 5/5] [SPARK-11692][SQL] Use === instead of == for better assertion error messages --- .../sql/execution/datasources/parquet/ParquetIOSuite.scala | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetIOSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetIOSuite.scala index 97f8b2004833c..3a9051d6f1b23 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetIOSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetIOSuite.scala @@ -249,9 +249,9 @@ class ParquetIOSuite extends QueryTest with ParquetTest with SharedSQLContext { ParquetFileWriter.writeMetadataFile(sparkContext.hadoopConfiguration, path, footer) val jsonDataType = sqlContext.read.parquet(path.toString).schema(0).dataType - assert(jsonDataType == StringType) + 
assert(jsonDataType === StringType) val bsonDataType = sqlContext.read.parquet(path.toString).schema(1).dataType - assert(bsonDataType == BinaryType) + assert(bsonDataType === BinaryType) } }
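
The net effect of this series on schema conversion can be sketched as a small standalone Scala object. This is an illustration only, not Spark's converter API: the object and method names (BinaryLogicalTypeMapping, catalystTypeFor) are hypothetical, and the real CatalystSchemaConverter's makeDecimalType and illegalType helpers are replaced here with plain exceptions. Only the case arms mirror the patched BINARY branch: JSON joins UTF8 and ENUM in mapping to StringType, while BSON maps to BinaryType.

import org.apache.parquet.schema.OriginalType
import org.apache.parquet.schema.OriginalType._
import org.apache.spark.sql.types.{BinaryType, DataType, StringType}

// Hypothetical helper for illustration only -- not part of Spark.
// It mirrors the BINARY branch of CatalystSchemaConverter after this series.
object BinaryLogicalTypeMapping {
  def catalystTypeFor(originalType: OriginalType, assumeBinaryIsString: Boolean): DataType =
    originalType match {
      case UTF8 | ENUM | JSON => StringType              // JSON documents are UTF-8 text
      case null if assumeBinaryIsString => StringType    // un-annotated binary, legacy string option on
      case null => BinaryType                            // un-annotated binary, option off
      case BSON => BinaryType                            // BSON documents stay as raw bytes
      // The real converter builds a DecimalType from the declared precision/scale
      // (makeDecimalType) and rejects anything else (illegalType); simplified here.
      case DECIMAL => throw new UnsupportedOperationException("DECIMAL requires precision and scale")
      case other => throw new UnsupportedOperationException(s"unsupported annotation: $other")
    }
}

// The expectations asserted in the new ParquetIOSuite test, restated against this sketch:
// BinaryLogicalTypeMapping.catalystTypeFor(JSON, assumeBinaryIsString = false) == StringType
// BinaryLogicalTypeMapping.catalystTypeFor(BSON, assumeBinaryIsString = false) == BinaryType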