From d219f7fb3949a8b5e41a36c3f6fc2b7b3820ba84 Mon Sep 17 00:00:00 2001 From: "Chen, Junjie" Date: Fri, 13 Jul 2018 11:29:14 +0800 Subject: [PATCH] HIVE-17593: DataWritableWriter strip spaces for CHAR type which cause PPD not work --- .../exec/vector/expressions/StringLength.java | 37 +++++++++++++++---- .../io/parquet/write/DataWritableWriter.java | 2 +- 2 files changed, 31 insertions(+), 8 deletions(-) diff --git a/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/expressions/StringLength.java b/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/expressions/StringLength.java index 956fd7b7b544..f8b1995b48df 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/expressions/StringLength.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/expressions/StringLength.java @@ -18,6 +18,7 @@ package org.apache.hadoop.hive.ql.exec.vector.expressions; +import java.nio.charset.StandardCharsets; import java.util.Arrays; import org.apache.hadoop.hive.ql.exec.vector.BytesColumnVector; @@ -25,6 +26,8 @@ import org.apache.hadoop.hive.ql.exec.vector.VectorExpressionDescriptor; import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch; import org.apache.hadoop.hive.ql.metadata.HiveException; +import org.apache.hadoop.hive.serde2.objectinspector.PrimitiveObjectInspector.PrimitiveCategory; +import org.apache.hadoop.hive.serde2.typeinfo.PrimitiveTypeInfo; /** * Calculate the length of the strings in the input column vector, and store @@ -74,11 +77,18 @@ public void evaluate(VectorizedRowBatch batch) throws HiveException { // We do not need to do a column reset since we are carefully changing the output. outputColVector.isRepeating = false; + // We do not need to consider trailing spaces for CHAR type. 
+ PrimitiveCategory category = PrimitiveCategory.STRING; + if (this.inputTypeInfos != null && this.inputTypeInfos.length != 0) { + PrimitiveTypeInfo typeInfo = (PrimitiveTypeInfo) this.inputTypeInfos[0]; + category = typeInfo.getPrimitiveCategory(); + } + if (inputColVector.isRepeating) { if (inputColVector.noNulls || !inputIsNull[0]) { // Set isNull before call in case it changes it mind. outputIsNull[0] = false; - resultLen[0] = utf8StringLength(vector[0], start[0], length[0]); + resultLen[0] = utf8StringLength(vector[0], start[0], length[0], category); } else { outputIsNull[0] = true; outputColVector.noNulls = false; @@ -97,12 +107,12 @@ public void evaluate(VectorizedRowBatch batch) throws HiveException { final int i = sel[j]; // Set isNull before call in case it changes it mind. outputIsNull[i] = false; - resultLen[i] = utf8StringLength(vector[i], start[i], length[i]); + resultLen[i] = utf8StringLength(vector[i], start[i], length[i], category); } } else { for(int j = 0; j != n; j++) { final int i = sel[j]; - resultLen[i] = utf8StringLength(vector[i], start[i], length[i]); + resultLen[i] = utf8StringLength(vector[i], start[i], length[i], category); } } } else { @@ -114,7 +124,7 @@ public void evaluate(VectorizedRowBatch batch) throws HiveException { outputColVector.noNulls = true; } for(int i = 0; i != n; i++) { - resultLen[i] = utf8StringLength(vector[i], start[i], length[i]); + resultLen[i] = utf8StringLength(vector[i], start[i], length[i], category); } } } else /* there are nulls in the inputColVector */ { @@ -127,7 +137,7 @@ public void evaluate(VectorizedRowBatch batch) throws HiveException { int i = sel[j]; outputColVector.isNull[i] = inputColVector.isNull[i]; if (!inputColVector.isNull[i]) { - resultLen[i] = utf8StringLength(vector[i], start[i], length[i]); + resultLen[i] = utf8StringLength(vector[i], start[i], length[i], category); } } outputColVector.isRepeating = false; @@ -135,7 +145,7 @@ public void evaluate(VectorizedRowBatch batch) throws 
HiveException { for(int i = 0; i != n; i++) { outputColVector.isNull[i] = inputColVector.isNull[i]; if (!inputColVector.isNull[i]) { - resultLen[i] = utf8StringLength(vector[i], start[i], length[i]); + resultLen[i] = utf8StringLength(vector[i], start[i], length[i], category); } } } @@ -146,7 +156,7 @@ public void evaluate(VectorizedRowBatch batch) throws HiveException { * Return length in characters of UTF8 string in byte array * beginning at start that is len bytes long. */ - static long utf8StringLength(byte[] s, int start, int len) { + static long utf8StringLength(byte[] s, int start, int len, PrimitiveCategory category) { long resultLength = 0; for (int i = start; i < start + len; i++) { @@ -158,6 +168,19 @@ static long utf8StringLength(byte[] s, int start, int len) { resultLength++; } } + + // Adjust length if the column type is CHAR + if (category == PrimitiveCategory.CHAR) { + String tmp = new String(s, start, len, StandardCharsets.UTF_8); + for(int i = tmp.length() - 1; i >= 0; i--) { + if (Character.isWhitespace(tmp.charAt(i))) { + resultLength--; + } else { + break; + } + } + } + return resultLength; } diff --git a/ql/src/java/org/apache/hadoop/hive/ql/io/parquet/write/DataWritableWriter.java b/ql/src/java/org/apache/hadoop/hive/ql/io/parquet/write/DataWritableWriter.java index 3d61c33afd2e..c8330270ec46 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/io/parquet/write/DataWritableWriter.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/io/parquet/write/DataWritableWriter.java @@ -453,7 +453,7 @@ public CharDataWriter(HiveCharObjectInspector inspector) { @Override public void write(Object value) { - String v = inspector.getPrimitiveJavaObject(value).getStrippedValue(); + String v = inspector.getPrimitiveJavaObject(value).getPaddedValue(); recordConsumer.addBinary(Binary.fromString(v)); } }