From 13a012ff41e96bf8fa841fcaa52357e732061e53 Mon Sep 17 00:00:00 2001 From: Dongjoon Hyun Date: Tue, 6 Feb 2018 01:08:47 -0800 Subject: [PATCH 1/2] [SPARK-23342][SQL][TEST] Add ORC configuration tests for ORC data source --- .../datasources/orc/OrcSourceSuite.scala | 74 ++++++++++++++++++- 1 file changed, 73 insertions(+), 1 deletion(-) diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/orc/OrcSourceSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/orc/OrcSourceSuite.scala index 6f5f2fd795f74..32f3846840a9d 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/orc/OrcSourceSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/orc/OrcSourceSuite.scala @@ -20,7 +20,8 @@ package org.apache.spark.sql.execution.datasources.orc import java.io.File import java.util.Locale -import org.apache.orc.OrcConf.COMPRESS +import org.apache.orc.{OrcFile, Reader} +import org.apache.orc.OrcConf.{BUFFER_SIZE, COMPRESS, ROW_INDEX_STRIDE, STRIPE_SIZE} import org.scalatest.BeforeAndAfterAll import org.apache.spark.sql.Row @@ -160,6 +161,77 @@ abstract class OrcSuite extends OrcTest with BeforeAndAfterAll { } } } + + private def getReader(path: String): Reader = { + val conf = spark.sessionState.newHadoopConf() + val files = OrcUtils.listOrcFiles(path, conf) + assert(files.length == 1) + val file = files.head + val fs = file.getFileSystem(conf) + val readerOptions = org.apache.orc.OrcFile.readerOptions(conf).filesystem(fs) + OrcFile.createReader(file, readerOptions) + } + + test("SPARK-23342 Support orc.stripe.size and hive.exec.orc.default.stripe.size") { + val df = spark.range(1000000).map(_ => scala.util.Random.nextLong).repartition(1) + + Seq(org.apache.orc.OrcConf.STRIPE_SIZE).foreach { conf => + Seq(conf.getAttribute, conf.getHiveConfName).foreach { name => + // Since the default value of orc.stripe.size is 64MB, there exists only 1 stripe. + withTempPath { path => + val dir = path.getCanonicalPath + df.write.format("orc").save(dir) + assert(getReader(dir).getStripes().size === 1) + } + + withTempPath { path => + val dir = path.getCanonicalPath + df.write.format("orc").option(name, "10000").save(dir) + assert(getReader(dir).getStripes().size > 100) + } + } + } + } + + test("SPARK-23342 Support orc.row.index.stride and hive.exec.orc.default.row.index.stride") { + val df = spark.range(1000000).map(_ => scala.util.Random.nextLong).repartition(1) + + Seq(ROW_INDEX_STRIDE).foreach { conf => + Seq(conf.getAttribute, conf.getHiveConfName).foreach { name => + withTempPath { path => + val dir = path.getCanonicalPath + df.write.format("orc").save(dir) + } + + withTempPath { path => + val dir = path.getCanonicalPath + df.write.format("orc").option(name, "1024").save(dir) + assert(getReader(dir).getRowIndexStride === 1024) + } + } + } + } + + test("SPARK-23342 Support orc.compress.size and hive.exec.orc.default.buffer.size") { + val df = spark.range(1000000).map(_ => scala.util.Random.nextLong).repartition(1) + + Seq(BUFFER_SIZE).foreach { conf => + Seq(conf.getAttribute, conf.getHiveConfName).foreach { name => + withTempPath { path => + val dir = path.getCanonicalPath + df.write.format("orc").save(dir) + assert(getReader(dir).getCompressionSize === BUFFER_SIZE.getDefaultValue) + } + + withTempPath { path => + val dir = path.getCanonicalPath + + df.write.format("orc").option(name, "1024").save(dir) + assert(getReader(dir).getCompressionSize === 1024) + } + } + } + } } class OrcSourceSuite extends OrcSuite with SharedSQLContext { From e51b2e7f295b2e5b6aef1147ee4ffd9a1bb2ea9a Mon Sep 17 00:00:00 2001 From: Dongjoon Hyun Date: Tue, 6 Feb 2018 01:30:51 -0800 Subject: [PATCH 2/2] update --- .../sql/execution/datasources/orc/OrcSourceSuite.scala | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/orc/OrcSourceSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/orc/OrcSourceSuite.scala index 32f3846840a9d..edd2cc12b8582 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/orc/OrcSourceSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/orc/OrcSourceSuite.scala @@ -167,15 +167,14 @@ abstract class OrcSuite extends OrcTest with BeforeAndAfterAll { val files = OrcUtils.listOrcFiles(path, conf) assert(files.length == 1) val file = files.head - val fs = file.getFileSystem(conf) - val readerOptions = org.apache.orc.OrcFile.readerOptions(conf).filesystem(fs) + val readerOptions = OrcFile.readerOptions(conf).filesystem(file.getFileSystem(conf)) OrcFile.createReader(file, readerOptions) } test("SPARK-23342 Support orc.stripe.size and hive.exec.orc.default.stripe.size") { val df = spark.range(1000000).map(_ => scala.util.Random.nextLong).repartition(1) - Seq(org.apache.orc.OrcConf.STRIPE_SIZE).foreach { conf => + Seq(STRIPE_SIZE).foreach { conf => Seq(conf.getAttribute, conf.getHiveConfName).foreach { name => // Since the default value of orc.stripe.size is 64MB, there exists only 1 stripe. withTempPath { path => @@ -201,6 +200,7 @@ abstract class OrcSuite extends OrcTest with BeforeAndAfterAll { withTempPath { path => val dir = path.getCanonicalPath df.write.format("orc").save(dir) + assert(getReader(dir).getRowIndexStride === ROW_INDEX_STRIDE.getDefaultValue) } withTempPath { path =>