From 7b4ef6676dcc103d6149eda0365541cb3ba8cabd Mon Sep 17 00:00:00 2001 From: Teddy Choi Date: Tue, 30 Oct 2018 07:47:14 +0900 Subject: [PATCH] HIVE-20827: Inconsistent results for empty arrays (Teddy Choi) Signed-off-by: Teddy Choi --- data/files/empty_array.txt | 1 + .../resources/testconfiguration.properties | 1 + .../test/queries/clientpositive/empty_array.q | 11 +++ .../results/clientpositive/empty_array.q.out | 70 +++++++++++++++++++ .../clientpositive/llap/empty_array.q.out | 70 +++++++++++++++++++ .../lazy/fast/LazySimpleDeserializeRead.java | 24 ++++++- .../hive/serde2/lazy/TestLazySimpleFast.java | 32 +++++++++ 7 files changed, 208 insertions(+), 1 deletion(-) create mode 100644 data/files/empty_array.txt create mode 100644 ql/src/test/queries/clientpositive/empty_array.q create mode 100644 ql/src/test/results/clientpositive/empty_array.q.out create mode 100644 ql/src/test/results/clientpositive/llap/empty_array.q.out diff --git a/data/files/empty_array.txt b/data/files/empty_array.txt new file mode 100644 index 000000000000..7edb2fa5bce5 --- /dev/null +++ b/data/files/empty_array.txt @@ -0,0 +1 @@ +, diff --git a/itests/src/test/resources/testconfiguration.properties b/itests/src/test/resources/testconfiguration.properties index da2091ac5d2b..f87f033444df 100644 --- a/itests/src/test/resources/testconfiguration.properties +++ b/itests/src/test/resources/testconfiguration.properties @@ -136,6 +136,7 @@ minillaplocal.shared.query.files=alter_merge_2_orc.q,\ dynpart_sort_opt_vectorization.q,\ dynpart_sort_optimization.q,\ dynpart_sort_optimization2.q,\ + empty_array.q,\ empty_join.q,\ enforce_order.q,\ filter_join_breaktask.q,\ diff --git a/ql/src/test/queries/clientpositive/empty_array.q b/ql/src/test/queries/clientpositive/empty_array.q new file mode 100644 index 000000000000..dff24e3fad2d --- /dev/null +++ b/ql/src/test/queries/clientpositive/empty_array.q @@ -0,0 +1,11 @@ +create table dtypes3 (c5 array<int>, c13 array<array<string>>) row format delimited fields terminated 
by ',' stored as TEXTFILE; +load data local inpath '../../data/files/empty_array.txt' into table dtypes3; +create table dtypes4 (c5 array<int>, c13 array<array<string>>) stored as ORC; +create table dtypes5 (c5 array<int>, c13 array<array<string>>) stored as TEXTFILE; + +SET hive.vectorized.execution.enabled=true; +insert into dtypes4 select * from dtypes3; +insert into dtypes5 select * from dtypes3; + +select * from dtypes4; +select * from dtypes5; diff --git a/ql/src/test/results/clientpositive/empty_array.q.out b/ql/src/test/results/clientpositive/empty_array.q.out new file mode 100644 index 000000000000..881bdcf7d519 --- /dev/null +++ b/ql/src/test/results/clientpositive/empty_array.q.out @@ -0,0 +1,70 @@ +PREHOOK: query: create table dtypes3 (c5 array<int>, c13 array<array<string>>) row format delimited fields terminated by ',' stored as TEXTFILE +PREHOOK: type: CREATETABLE +PREHOOK: Output: database:default +PREHOOK: Output: default@dtypes3 +POSTHOOK: query: create table dtypes3 (c5 array<int>, c13 array<array<string>>) row format delimited fields terminated by ',' stored as TEXTFILE +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: database:default +POSTHOOK: Output: default@dtypes3 +PREHOOK: query: load data local inpath '../../data/files/empty_array.txt' into table dtypes3 +PREHOOK: type: LOAD +#### A masked pattern was here #### +PREHOOK: Output: default@dtypes3 +POSTHOOK: query: load data local inpath '../../data/files/empty_array.txt' into table dtypes3 +POSTHOOK: type: LOAD +#### A masked pattern was here #### +POSTHOOK: Output: default@dtypes3 +PREHOOK: query: create table dtypes4 (c5 array<int>, c13 array<array<string>>) stored as ORC +PREHOOK: type: CREATETABLE +PREHOOK: Output: database:default +PREHOOK: Output: default@dtypes4 +POSTHOOK: query: create table dtypes4 (c5 array<int>, c13 array<array<string>>) stored as ORC +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: database:default +POSTHOOK: Output: default@dtypes4 +PREHOOK: query: create table dtypes5 (c5 array<int>, c13 array<array<string>>) stored as TEXTFILE +PREHOOK: type: CREATETABLE +PREHOOK: Output: database:default 
+PREHOOK: Output: default@dtypes5 +POSTHOOK: query: create table dtypes5 (c5 array<int>, c13 array<array<string>>) stored as TEXTFILE +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: database:default +POSTHOOK: Output: default@dtypes5 +PREHOOK: query: insert into dtypes4 select * from dtypes3 +PREHOOK: type: QUERY +PREHOOK: Input: default@dtypes3 +PREHOOK: Output: default@dtypes4 +POSTHOOK: query: insert into dtypes4 select * from dtypes3 +POSTHOOK: type: QUERY +POSTHOOK: Input: default@dtypes3 +POSTHOOK: Output: default@dtypes4 +POSTHOOK: Lineage: dtypes4.c13 SIMPLE [(dtypes3)dtypes3.FieldSchema(name:c13, type:array<array<string>>, comment:null), ] +POSTHOOK: Lineage: dtypes4.c5 SIMPLE [(dtypes3)dtypes3.FieldSchema(name:c5, type:array<int>, comment:null), ] +PREHOOK: query: insert into dtypes5 select * from dtypes3 +PREHOOK: type: QUERY +PREHOOK: Input: default@dtypes3 +PREHOOK: Output: default@dtypes5 +POSTHOOK: query: insert into dtypes5 select * from dtypes3 +POSTHOOK: type: QUERY +POSTHOOK: Input: default@dtypes3 +POSTHOOK: Output: default@dtypes5 +POSTHOOK: Lineage: dtypes5.c13 SIMPLE [(dtypes3)dtypes3.FieldSchema(name:c13, type:array<array<string>>, comment:null), ] +POSTHOOK: Lineage: dtypes5.c5 SIMPLE [(dtypes3)dtypes3.FieldSchema(name:c5, type:array<int>, comment:null), ] +PREHOOK: query: select * from dtypes4 +PREHOOK: type: QUERY +PREHOOK: Input: default@dtypes4 +#### A masked pattern was here #### +POSTHOOK: query: select * from dtypes4 +POSTHOOK: type: QUERY +POSTHOOK: Input: default@dtypes4 +#### A masked pattern was here #### +[] [] +PREHOOK: query: select * from dtypes5 +PREHOOK: type: QUERY +PREHOOK: Input: default@dtypes5 +#### A masked pattern was here #### +POSTHOOK: query: select * from dtypes5 +POSTHOOK: type: QUERY +POSTHOOK: Input: default@dtypes5 +#### A masked pattern was here #### +[] [] diff --git a/ql/src/test/results/clientpositive/llap/empty_array.q.out b/ql/src/test/results/clientpositive/llap/empty_array.q.out new file mode 100644 index 000000000000..881bdcf7d519 --- /dev/null +++ 
b/ql/src/test/results/clientpositive/llap/empty_array.q.out @@ -0,0 +1,70 @@ +PREHOOK: query: create table dtypes3 (c5 array<int>, c13 array<array<string>>) row format delimited fields terminated by ',' stored as TEXTFILE +PREHOOK: type: CREATETABLE +PREHOOK: Output: database:default +PREHOOK: Output: default@dtypes3 +POSTHOOK: query: create table dtypes3 (c5 array<int>, c13 array<array<string>>) row format delimited fields terminated by ',' stored as TEXTFILE +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: database:default +POSTHOOK: Output: default@dtypes3 +PREHOOK: query: load data local inpath '../../data/files/empty_array.txt' into table dtypes3 +PREHOOK: type: LOAD +#### A masked pattern was here #### +PREHOOK: Output: default@dtypes3 +POSTHOOK: query: load data local inpath '../../data/files/empty_array.txt' into table dtypes3 +POSTHOOK: type: LOAD +#### A masked pattern was here #### +POSTHOOK: Output: default@dtypes3 +PREHOOK: query: create table dtypes4 (c5 array<int>, c13 array<array<string>>) stored as ORC +PREHOOK: type: CREATETABLE +PREHOOK: Output: database:default +PREHOOK: Output: default@dtypes4 +POSTHOOK: query: create table dtypes4 (c5 array<int>, c13 array<array<string>>) stored as ORC +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: database:default +POSTHOOK: Output: default@dtypes4 +PREHOOK: query: create table dtypes5 (c5 array<int>, c13 array<array<string>>) stored as TEXTFILE +PREHOOK: type: CREATETABLE +PREHOOK: Output: database:default +PREHOOK: Output: default@dtypes5 +POSTHOOK: query: create table dtypes5 (c5 array<int>, c13 array<array<string>>) stored as TEXTFILE +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: database:default +POSTHOOK: Output: default@dtypes5 +PREHOOK: query: insert into dtypes4 select * from dtypes3 +PREHOOK: type: QUERY +PREHOOK: Input: default@dtypes3 +PREHOOK: Output: default@dtypes4 +POSTHOOK: query: insert into dtypes4 select * from dtypes3 +POSTHOOK: type: QUERY +POSTHOOK: Input: default@dtypes3 +POSTHOOK: Output: default@dtypes4 +POSTHOOK: Lineage: dtypes4.c13 SIMPLE [(dtypes3)dtypes3.FieldSchema(name:c13, type:array<array<string>>, 
comment:null), ] +POSTHOOK: Lineage: dtypes4.c5 SIMPLE [(dtypes3)dtypes3.FieldSchema(name:c5, type:array<int>, comment:null), ] +PREHOOK: query: insert into dtypes5 select * from dtypes3 +PREHOOK: type: QUERY +PREHOOK: Input: default@dtypes3 +PREHOOK: Output: default@dtypes5 +POSTHOOK: query: insert into dtypes5 select * from dtypes3 +POSTHOOK: type: QUERY +POSTHOOK: Input: default@dtypes3 +POSTHOOK: Output: default@dtypes5 +POSTHOOK: Lineage: dtypes5.c13 SIMPLE [(dtypes3)dtypes3.FieldSchema(name:c13, type:array<array<string>>, comment:null), ] +POSTHOOK: Lineage: dtypes5.c5 SIMPLE [(dtypes3)dtypes3.FieldSchema(name:c5, type:array<int>, comment:null), ] +PREHOOK: query: select * from dtypes4 +PREHOOK: type: QUERY +PREHOOK: Input: default@dtypes4 +#### A masked pattern was here #### +POSTHOOK: query: select * from dtypes4 +POSTHOOK: type: QUERY +POSTHOOK: Input: default@dtypes4 +#### A masked pattern was here #### +[] [] +PREHOOK: query: select * from dtypes5 +PREHOOK: type: QUERY +PREHOOK: Input: default@dtypes5 +#### A masked pattern was here #### +POSTHOOK: query: select * from dtypes5 +POSTHOOK: type: QUERY +POSTHOOK: Input: default@dtypes5 +#### A masked pattern was here #### +[] [] diff --git a/serde/src/java/org/apache/hadoop/hive/serde2/lazy/fast/LazySimpleDeserializeRead.java b/serde/src/java/org/apache/hadoop/hive/serde2/lazy/fast/LazySimpleDeserializeRead.java index 45c44da734ff..7b8aae4a9287 100644 --- a/serde/src/java/org/apache/hadoop/hive/serde2/lazy/fast/LazySimpleDeserializeRead.java +++ b/serde/src/java/org/apache/hadoop/hive/serde2/lazy/fast/LazySimpleDeserializeRead.java @@ -950,7 +950,29 @@ public boolean isNextComplexMultiValue() { case LIST: { // Allow for empty string, etc. 
- final boolean isNext = (fieldPosition <= complexFieldEnd); + final ListComplexTypeHelper listHelper = (ListComplexTypeHelper) complexTypeHelper; + final boolean isElementStringFamily; + final Field elementField = listHelper.elementField; + if (elementField.isPrimitive) { + switch (elementField.primitiveCategory) { + case STRING: + case VARCHAR: + case CHAR: + isElementStringFamily = true; + break; + default: + isElementStringFamily = false; + break; + } + } else { + isElementStringFamily = false; + } + final boolean isNext; + if (isElementStringFamily) { + isNext = (fieldPosition <= complexFieldEnd); + } else { + isNext = (fieldPosition < complexFieldEnd); + } if (!isNext) { popComplexType(); } diff --git a/serde/src/test/org/apache/hadoop/hive/serde2/lazy/TestLazySimpleFast.java b/serde/src/test/org/apache/hadoop/hive/serde2/lazy/TestLazySimpleFast.java index fbb6040331c6..34b51c8fa96f 100644 --- a/serde/src/test/org/apache/hadoop/hive/serde2/lazy/TestLazySimpleFast.java +++ b/serde/src/test/org/apache/hadoop/hive/serde2/lazy/TestLazySimpleFast.java @@ -20,10 +20,12 @@ import java.io.IOException; import java.util.ArrayList; import java.util.Arrays; +import java.util.Collections; import java.util.Properties; import java.util.Random; import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.hive.conf.HiveConf; import org.apache.hadoop.hive.serde.serdeConstants; import org.apache.hadoop.hive.serde2.ByteStream.Output; import org.apache.hadoop.hive.serde2.SerDeException; @@ -37,7 +39,10 @@ import org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector; import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector.Category; import org.apache.hadoop.hive.serde2.objectinspector.UnionObject; +import org.apache.hadoop.hive.serde2.typeinfo.StructTypeInfo; import org.apache.hadoop.hive.serde2.typeinfo.TypeInfo; +import org.apache.hadoop.hive.serde2.typeinfo.TypeInfoFactory; +import org.apache.hadoop.hive.serde2.typeinfo.TypeInfoUtils; import 
org.apache.hadoop.io.BytesWritable; import org.apache.hadoop.io.Text; @@ -387,4 +392,31 @@ public void testLazyBinarySimpleComplexDepthOne() throws Throwable { public void testLazyBinarySimpleComplexDepthFour() throws Throwable { testLazySimpleFast(SerdeRandomRowSource.SupportedTypes.ALL, 4); } + + public void testLazySimpleDeserializeRowEmptyArray() throws Throwable { + HiveConf hconf = new HiveConf(); + + // set the escaping related properties + Properties props = new Properties(); + props.setProperty(serdeConstants.FIELD_DELIM, ","); + + LazySerDeParameters lazyParams = + new LazySerDeParameters(hconf, props, + LazySimpleSerDe.class.getName()); + + TypeInfo[] typeInfos = new TypeInfo[] { + TypeInfoFactory.getListTypeInfo( + TypeInfoFactory.intTypeInfo), + TypeInfoFactory.getListTypeInfo( + TypeInfoFactory.getListTypeInfo( + TypeInfoFactory.stringTypeInfo))}; + LazySimpleDeserializeRead deserializeRead = + new LazySimpleDeserializeRead(typeInfos, null, true, lazyParams); + + byte[] bytes = ",".getBytes(); + deserializeRead.set(bytes, 0, bytes.length); + verifyRead(deserializeRead, typeInfos[0], Collections.emptyList()); + verifyRead(deserializeRead, typeInfos[1], Collections.emptyList()); + TestCase.assertTrue(deserializeRead.isEndOfInputReached()); + } } \ No newline at end of file