From e7f463c1865d8518a5eb1756b9a548442fb776c1 Mon Sep 17 00:00:00 2001 From: Teddy Choi Date: Tue, 10 Jul 2018 22:07:13 +0900 Subject: [PATCH] HIVE-20044: Arrow Serde should pad char values and handle empty strings correctly (Teddy Choi) Change-Id: Ia87503aabf38c9599b887795d2b62566d8965f9b --- .../hadoop/hive/ql/io/arrow/Serializer.java | 29 ++++++++++++++++++- .../io/arrow/TestArrowColumnarBatchSerDe.java | 27 +++++++++++++++++ 2 files changed, 55 insertions(+), 1 deletion(-) diff --git a/ql/src/java/org/apache/hadoop/hive/ql/io/arrow/Serializer.java b/ql/src/java/org/apache/hadoop/hive/ql/io/arrow/Serializer.java index 2961050532fd..8d8b6fd6e0c6 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/io/arrow/Serializer.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/io/arrow/Serializer.java @@ -62,6 +62,7 @@ import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorUtils; import org.apache.hadoop.hive.serde2.objectinspector.PrimitiveObjectInspector; import org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector; +import org.apache.hadoop.hive.serde2.typeinfo.CharTypeInfo; import org.apache.hadoop.hive.serde2.typeinfo.DecimalTypeInfo; import org.apache.hadoop.hive.serde2.typeinfo.ListTypeInfo; import org.apache.hadoop.hive.serde2.typeinfo.MapTypeInfo; @@ -71,6 +72,7 @@ import org.apache.hadoop.hive.serde2.typeinfo.UnionTypeInfo; import java.util.ArrayList; +import java.util.Arrays; import java.util.List; import static org.apache.hadoop.hive.conf.HiveConf.ConfVars.HIVE_ARROW_BATCH_SIZE; @@ -87,6 +89,7 @@ class Serializer { private final int MAX_BUFFERED_ROWS; + private final static byte[] EMPTY_BYTES = new byte[0]; // Schema private final StructTypeInfo structTypeInfo; @@ -415,7 +418,31 @@ private void writePrimitive(FieldVector arrowVector, ColumnVector hiveVector, Ty if (hiveVector.isNull[i]) { varCharVector.setNull(i); } else { - varCharVector.setSafe(i, bytesVector.vector[i], bytesVector.start[i], bytesVector.length[i]); + byte[] bytes = bytesVector.vector[i]; + int length = bytesVector.length[i]; + int start = bytesVector.start[i]; + + if (bytes == null) { + bytes = EMPTY_BYTES; + start = 0; + length = 0; + } + + // Char padding + if (primitiveCategory == PrimitiveObjectInspector.PrimitiveCategory.CHAR) { + final CharTypeInfo charTypeInfo = (CharTypeInfo) typeInfo; + final int paddedLength = charTypeInfo.getLength(); + final int paddedStart = 0; + final byte[] paddedBytes = new byte[paddedLength]; + + System.arraycopy(bytes, start, paddedBytes, paddedStart, length); + Arrays.fill(paddedBytes, length, paddedLength, (byte) ' '); + + length = paddedLength; + start = paddedStart; + bytes = paddedBytes; + } + varCharVector.setSafe(i, bytes, start, length); } } } diff --git a/ql/src/test/org/apache/hadoop/hive/ql/io/arrow/TestArrowColumnarBatchSerDe.java b/ql/src/test/org/apache/hadoop/hive/ql/io/arrow/TestArrowColumnarBatchSerDe.java index c9a5812e4716..10f01473d564 100644 --- a/ql/src/test/org/apache/hadoop/hive/ql/io/arrow/TestArrowColumnarBatchSerDe.java +++ b/ql/src/test/org/apache/hadoop/hive/ql/io/arrow/TestArrowColumnarBatchSerDe.java @@ -20,6 +20,7 @@ import com.google.common.base.Joiner; import com.google.common.collect.Lists; import com.google.common.collect.Maps; +import org.apache.arrow.vector.VarCharVector; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.hive.common.type.HiveChar; import org.apache.hadoop.hive.common.type.HiveDecimal; @@ -768,6 +769,32 @@ public void testMapBinary() throws SerDeException { initAndSerializeAndDeserialize(schema, toMap(BINARY_ROWS)); } + @Test + public void testPrimitiveCharPadding() throws SerDeException { + String[][] schema = { + {"char1", "char(10)"}, + }; + + HiveCharWritable[][] rows = new HiveCharWritable[][] { + {charW("Hello", 10)}, {charW("world!", 10)}}; + ArrowColumnarBatchSerDe serDe = new ArrowColumnarBatchSerDe(); + StructObjectInspector rowOI = initSerDe(serDe, schema); + + ArrowWrapperWritable serialized = null; + for (Object[] row : rows) { + serialized = serDe.serialize(row, rowOI); + } + // Pass null to complete a batch + if (serialized == null) { + serialized = serDe.serialize(null, rowOI); + } + + VarCharVector varCharVector = (VarCharVector) serialized.getVectorSchemaRoot().getFieldVectors().get(0); + for (int i = 0; i < rows.length; i++) { + assertEquals(rows[i][0].getPaddedValue().toString(), new String(varCharVector.get(i))); + } + } + public void testMapDecimal() throws SerDeException { String[][] schema = { {"decimal_map", "map"},