-
Notifications
You must be signed in to change notification settings - Fork 28.3k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
[SPARK-37728][SQL][3.2] Reading nested columns with ORC vectorized reader can cause ArrayIndexOutOfBoundsException #35038
Changes from 7 commits
3403ca4
1708b68
6dd4f46
de4fd40
62b98ce
5f2e85d
be7a155
294b02b
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -34,7 +34,9 @@ import org.apache.orc.mapreduce.OrcInputFormat | |
import org.apache.spark.{SparkConf, SparkException} | ||
import org.apache.spark.sql._ | ||
import org.apache.spark.sql.catalyst.TableIdentifier | ||
import org.apache.spark.sql.execution.FileSourceScanExec | ||
import org.apache.spark.sql.execution.datasources.{HadoopFsRelation, LogicalRelation, RecordReaderIterator} | ||
import org.apache.spark.sql.execution.datasources.v2.BatchScanExec | ||
import org.apache.spark.sql.internal.SQLConf | ||
import org.apache.spark.sql.test.SharedSparkSession | ||
import org.apache.spark.sql.types._ | ||
|
@@ -713,6 +715,29 @@ abstract class OrcQuerySuite extends OrcQueryTest with SharedSparkSession { | |
} | ||
} | ||
} | ||
|
||
// Regression test for SPARK-37728: reading nested (array) columns with the ORC
// vectorized reader must not throw ArrayIndexOutOfBoundsException.
test("SPARK-37728: Reading nested columns with ORC vectorized reader should not " + | ||
"cause ArrayIndexOutOfBoundsException") { | ||
withTempPath { dir => | ||
val path = dir.getCanonicalPath | ||
// Build 100 rows, each holding a 50-element array whose elements are
// 1000-element string sequences, then write them as a single ORC file
// (repartition(1)) so one reader sees all the nested data.
val df = spark.range(100).map { _ => | ||
val arrayColumn = (0 until 50).map(_ => (0 until 1000).map(k => k.toString)) | ||
arrayColumn | ||
}.toDF("record").repartition(1) | ||
df.write.format("orc").save(path) | ||
|
||
// Enable the nested-column vectorized ORC reader explicitly; raise
// WHOLESTAGE_MAX_NUM_FIELDS so the wide nested schema does not disable
// columnar execution — presumably required for supportsColumnar to hold
// here (NOTE(review): confirm against the conf's effect on codegen).
withSQLConf(SQLConf.ORC_VECTORIZED_READER_NESTED_COLUMN_ENABLED.key -> "true", | ||
SQLConf.WHOLESTAGE_MAX_NUM_FIELDS.key -> "10000") { | ||
val readDf = spark.read.orc(path) | ||
// Scan the physical plan for a v1 (FileSourceScanExec) or v2
// (BatchScanExec) scan node and record whether it reports columnar
// (i.e. vectorized) support.
val vectorizationEnabled = readDf.queryExecution.executedPlan.find { | ||
case scan @ (_: FileSourceScanExec | _: BatchScanExec) => scan.supportsColumnar | ||
case _ => false | ||
}.isDefined | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Why did you remove, Since this test case is about There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
Because when testing with OrcV2QuerySuite, method To fix this issue, I think #33626 should also be backported to branch-3.2. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Got it. Thank you for the background, @yym1995 . |
||
// Guard: the vectorized path must actually be exercised, otherwise the
// round-trip check below would not cover the SPARK-37728 code path.
assert(vectorizationEnabled) | ||
// Round-trip check: reading back must reproduce the written data
// without throwing.
checkAnswer(readDf, df) | ||
} | ||
} | ||
} | ||
|
||
class OrcV1QuerySuite extends OrcQuerySuite { | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Indentation? We need two more spaces.