-
Notifications
You must be signed in to change notification settings - Fork 198
Description
Describe the bug
Large batches after join operations exceed Arrow Vector 2GB memory limit, causing OversizedAllocationException.
To Reproduce
./bin/spark-sql \
--conf spark.sql.autoBroadcastJoinThreshold=-1 \
--conf spark.sql.adaptive.autoBroadcastJoinThreshold=-1

create table tmp_t1(a int, b int) stored as orc;
with g1 as (select id as a from range(1)),
g2 as (select id as b from range(10000))
insert overwrite tmp_t1 select g1.a, g2.b from g1 join g2;
create table tmp_t2(a int, b int) stored as orc;
with g1 as (select id as a from range(1)),
g2 as (select id as b from range(10000))
insert overwrite tmp_t2 select g1.a, g2.b from g1 join g2;
select s, count(1) as cnt
from (select concat(
cast(date_add('2010-01-01', t1.b) as string),
cast(date_add('2010-01-02', t2.b) as string)
) as s
from tmp_t1 t1 join tmp_t2 t2 on t1.a = t2.a)
group by s
order by cnt
limit 100;

Exception in thread "auron native task 0.0 in stage 4.0 (TID 10)" auron.org.apache.arrow.vector.util.OversizedAllocationException: Memory required for vector is (2147483648), which is overflow or more than max allowed (2147483647). You could consider using LargeVarCharVector/LargeVarBinaryVector for large strings/large bytes types
at auron.org.apache.arrow.vector.BaseVariableWidthVector.checkDataBufferSize(BaseVariableWidthVector.java:465)
at auron.org.apache.arrow.vector.BaseVariableWidthVector.reallocDataBuffer(BaseVariableWidthVector.java:574)
at auron.org.apache.arrow.vector.BaseVariableWidthVector.handleSafe(BaseVariableWidthVector.java:1344)
at auron.org.apache.arrow.vector.BaseVariableWidthVector.setSafe(BaseVariableWidthVector.java:1178)
at org.apache.spark.sql.execution.auron.arrowio.util.StringWriter.setValue(ArrowWriter.scala:247)
at org.apache.spark.sql.execution.auron.arrowio.util.ArrowFieldWriter.write(ArrowWriter.scala:126)
at org.apache.spark.sql.execution.auron.arrowio.util.ArrowWriter.write(ArrowWriter.scala:97)
at org.apache.auron.spark.sql.SparkAuronUDFWrapperContext.$anonfun$eval$5(SparkAuronUDFWrapperContext.scala:78)
at org.apache.auron.spark.sql.SparkAuronUDFWrapperContext.$anonfun$eval$5$adapted(SparkAuronUDFWrapperContext.scala:76)
at scala.collection.Iterator.foreach(Iterator.scala:943)
at scala.collection.Iterator.foreach$(Iterator.scala:943)
at scala.collection.AbstractIterator.foreach(Iterator.scala:1431)
at org.apache.auron.spark.sql.SparkAuronUDFWrapperContext.$anonfun$eval$4(SparkAuronUDFWrapperContext.scala:76)
at org.apache.auron.spark.sql.SparkAuronUDFWrapperContext.$anonfun$eval$4$adapted(SparkAuronUDFWrapperContext.scala:69)
at org.apache.spark.sql.auron.util.Using$.$anonfun$resources$9(Using.scala:395)
at org.apache.spark.sql.auron.util.Using$.resource(Using.scala:273)
at org.apache.spark.sql.auron.util.Using$.$anonfun$resources$8(Using.scala:394)
at org.apache.spark.sql.auron.util.Using$.resource(Using.scala:273)
at org.apache.spark.sql.auron.util.Using$.$anonfun$resources$7(Using.scala:393)
at org.apache.spark.sql.auron.util.Using$.resource(Using.scala:273)
at org.apache.spark.sql.auron.util.Using$.$anonfun$resources$6(Using.scala:392)
at org.apache.spark.sql.auron.util.Using$.resource(Using.scala:273)
at org.apache.spark.sql.auron.util.Using$.resources(Using.scala:391)
at org.apache.auron.spark.sql.SparkAuronUDFWrapperContext.eval(SparkAuronUDFWrapperContext.scala:69)

Expected behavior
Query should execute without memory allocation errors.
Screenshots
Additional context
Metadata
Metadata
Assignees
Labels
No labels