From 5b45628be0ad4462264b1724a22505c636f69afc Mon Sep 17 00:00:00 2001 From: Owen O'Malley Date: Fri, 25 Mar 2016 19:39:12 -0700 Subject: [PATCH] HIVE-11417. Move the ReaderImpl and RowReaderImpl to the ORC module, by making shims for the row by row reader. --- bin/ext/orcfiledump.cmd | 2 +- bin/ext/orcfiledump.sh | 2 +- .../hcatalog/streaming/TestStreaming.java | 9 +- .../io/decode/OrcEncodedDataConsumer.java | 2 +- .../llap/io/encoded/OrcEncodedDataReader.java | 15 +- .../llap/io/metadata/OrcFileMetadata.java | 4 +- orc/pom.xml | 27 + .../org/apache/orc}/FileFormatException.java | 2 +- orc/src/java/org/apache/orc/OrcFile.java | 6 + orc/src/java/org/apache/orc/Reader.java | 2 +- .../java/org/apache/orc/TypeDescription.java | 18 +- .../java/org/apache/orc/impl/AcidStats.java | 60 + .../orc/impl}/ConvertTreeReaderFactory.java | 974 +-------- .../java/org/apache/orc/impl/HadoopShims.java | 79 + .../apache/orc/impl/HadoopShimsCurrent.java | 30 + .../org/apache/orc/impl/HadoopShims_2_2.java | 71 +- .../org/apache/orc/impl/IntegerReader.java | 3 +- .../org/apache/orc/impl/OrcAcidUtils.java | 85 + .../java/org/apache/orc/impl/ReaderImpl.java | 758 +++++++ .../org/apache/orc/impl/RecordReaderImpl.java | 1215 +++++++++++ .../apache/orc/impl}/RecordReaderUtils.java | 36 +- .../org/apache/orc/impl}/SchemaEvolution.java | 2 +- .../apache/orc/impl}/TreeReaderFactory.java | 446 +--- .../org/apache/orc/impl}/ZeroCopyShims.java | 23 +- .../java/org/apache/orc/tools}/FileDump.java | 230 ++- .../org/apache/orc/tools}/JsonFileDump.java | 15 +- .../org/apache}/orc/TestColumnStatistics.java | 102 +- .../apache}/orc/TestNewIntegerEncoding.java | 1015 ++++----- .../apache}/orc/TestOrcNullOptimization.java | 339 +-- .../org/apache}/orc/TestOrcTimezone1.java | 79 +- .../org/apache}/orc/TestOrcTimezone2.java | 33 +- .../org/apache}/orc/TestStringDictionary.java | 155 +- .../org/apache}/orc/TestTypeDescription.java | 2 +- .../org/apache}/orc/TestUnrolledBitPack.java | 30 +- .../org/apache}/orc/TestVectorOrcFile.java | 187 +- .../apache/orc/impl}/TestOrcWideTable.java | 4 +- .../test/org/apache/orc/impl}/TestRLEv2.java | 98 +- .../org/apache/orc/impl}/TestReaderImpl.java | 7 +- .../orc/impl}/TestRecordReaderImpl.java | 197 +- .../org/apache/orc/impl}/TestStreamName.java | 3 +- .../org/apache/orc/tools}/TestFileDump.java | 328 +-- .../apache/orc/tools}/TestJsonFileDump.java | 65 +- orc/src/test/resources/orc-file-11-format.orc | Bin 0 -> 373336 bytes .../resources/orc-file-dump-bloomfilter.out | 0 .../resources/orc-file-dump-bloomfilter2.out | 0 .../orc-file-dump-dictionary-threshold.out | 0 .../src/test/resources/orc-file-dump.json | 0 .../src/test/resources/orc-file-dump.out | 0 .../src/test/resources/orc-file-has-null.out | 0 .../expressions/CastDecimalToTimestamp.java | 8 +- .../expressions/CastDoubleToTimestamp.java | 13 +- .../expressions/CastLongToTimestamp.java | 4 +- .../CastMillisecondsLongToTimestamp.java | 7 +- .../hive/ql/hooks/PostExecOrcFileDump.java | 4 +- .../hive/ql/io/orc/OrcRawRecordMerger.java | 35 +- .../hive/ql/io/orc/OrcRecordUpdater.java | 71 +- .../hadoop/hive/ql/io/orc/ReaderImpl.java | 509 +---- .../hive/ql/io/orc/RecordReaderImpl.java | 1823 +++++++---------- .../ql/io/orc/encoded/EncodedReaderImpl.java | 2 +- .../orc/encoded/EncodedTreeReaderFactory.java | 2 +- .../hadoop/hive/ql/TestTxnCommands.java | 2 - .../TestTimestampWritableAndColumnVector.java | 7 +- .../expressions/TestVectorTypeCasts.java | 10 +- .../exec/vector/udf/TestVectorUDFAdaptor.java | 2 - 
.../hadoop/hive/ql/io/orc/TestOrcFile.java | 70 +- .../hive/ql/io/orc/TestOrcRecordUpdater.java | 4 +- .../results/clientpositive/orc_create.q.out | 12 +- .../orc_int_type_promotion.q.out | 12 +- ...l_orc_vec_mapwork_part_all_primitive.q.out | 40 +- ...l_orc_vec_mapwork_part_all_primitive.q.out | 40 +- .../clientpositive/vector_complex_all.q.out | 6 +- .../hive/serde2/io/TimestampWritable.java | 114 +- .../PrimitiveObjectInspectorUtils.java | 7 +- .../hive/serde2/io/TestTimestampWritable.java | 41 +- .../hadoop/hive/shims/Hadoop23Shims.java | 63 +- .../apache/hadoop/hive/shims/HadoopShims.java | 70 - .../hadoop/hive/shims/HadoopShimsSecure.java | 29 - .../ql/exec/vector/TimestampColumnVector.java | 9 +- .../exec/vector/expressions/StringExpr.java | 0 .../hive/ql/io/sarg/SearchArgumentImpl.java | 16 +- .../hadoop/hive/ql/util/TimestampUtils.java | 94 + 81 files changed, 5063 insertions(+), 4823 deletions(-) rename {ql/src/java/org/apache/hadoop/hive/ql/io => orc/src/java/org/apache/orc}/FileFormatException.java (96%) create mode 100644 orc/src/java/org/apache/orc/impl/AcidStats.java rename {ql/src/java/org/apache/hadoop/hive/ql/io/orc => orc/src/java/org/apache/orc/impl}/ConvertTreeReaderFactory.java (76%) create mode 100644 orc/src/java/org/apache/orc/impl/OrcAcidUtils.java create mode 100644 orc/src/java/org/apache/orc/impl/ReaderImpl.java create mode 100644 orc/src/java/org/apache/orc/impl/RecordReaderImpl.java rename {ql/src/java/org/apache/hadoop/hive/ql/io/orc => orc/src/java/org/apache/orc/impl}/RecordReaderUtils.java (95%) rename {ql/src/java/org/apache/hadoop/hive/ql/io/orc => orc/src/java/org/apache/orc/impl}/SchemaEvolution.java (99%) rename {ql/src/java/org/apache/hadoop/hive/ql/io/orc => orc/src/java/org/apache/orc/impl}/TreeReaderFactory.java (84%) rename {shims/0.23/src/main/java/org/apache/hadoop/hive/shims => orc/src/java/org/apache/orc/impl}/ZeroCopyShims.java (77%) rename {ql/src/java/org/apache/hadoop/hive/ql/io/orc => orc/src/java/org/apache/orc/tools}/FileDump.java (82%) rename {ql/src/java/org/apache/hadoop/hive/ql/io/orc => orc/src/java/org/apache/orc/tools}/JsonFileDump.java (96%) rename {ql/src/test/org/apache/hadoop/hive/ql/io => orc/src/test/org/apache}/orc/TestColumnStatistics.java (85%) rename {ql/src/test/org/apache/hadoop/hive/ql/io => orc/src/test/org/apache}/orc/TestNewIntegerEncoding.java (51%) rename {ql/src/test/org/apache/hadoop/hive/ql/io => orc/src/test/org/apache}/orc/TestOrcNullOptimization.java (52%) rename {ql/src/test/org/apache/hadoop/hive/ql/io => orc/src/test/org/apache}/orc/TestOrcTimezone1.java (73%) rename {ql/src/test/org/apache/hadoop/hive/ql/io => orc/src/test/org/apache}/orc/TestOrcTimezone2.java (82%) rename {ql/src/test/org/apache/hadoop/hive/ql/io => orc/src/test/org/apache}/orc/TestStringDictionary.java (61%) rename {ql/src/test/org/apache/hadoop/hive/ql/io => orc/src/test/org/apache}/orc/TestTypeDescription.java (98%) rename {ql/src/test/org/apache/hadoop/hive/ql/io => orc/src/test/org/apache}/orc/TestUnrolledBitPack.java (81%) rename {ql/src/test/org/apache/hadoop/hive/ql/io => orc/src/test/org/apache}/orc/TestVectorOrcFile.java (95%) rename {ql/src/test/org/apache/hadoop/hive/ql/io/orc => orc/src/test/org/apache/orc/impl}/TestOrcWideTable.java (97%) rename {ql/src/test/org/apache/hadoop/hive/ql/io/orc => orc/src/test/org/apache/orc/impl}/TestRLEv2.java (81%) rename {ql/src/test/org/apache/hadoop/hive/ql/io/orc => orc/src/test/org/apache/orc/impl}/TestReaderImpl.java (96%) rename {ql/src/test/org/apache/hadoop/hive/ql/io/orc => 
orc/src/test/org/apache/orc/impl}/TestRecordReaderImpl.java (90%) rename {ql/src/test/org/apache/hadoop/hive/ql/io/orc => orc/src/test/org/apache/orc/impl}/TestStreamName.java (95%) rename {ql/src/test/org/apache/hadoop/hive/ql/io/orc => orc/src/test/org/apache/orc/tools}/TestFileDump.java (54%) rename {ql/src/test/org/apache/hadoop/hive/ql/io/orc => orc/src/test/org/apache/orc/tools}/TestJsonFileDump.java (71%) create mode 100644 orc/src/test/resources/orc-file-11-format.orc rename {ql => orc}/src/test/resources/orc-file-dump-bloomfilter.out (100%) rename {ql => orc}/src/test/resources/orc-file-dump-bloomfilter2.out (100%) rename {ql => orc}/src/test/resources/orc-file-dump-dictionary-threshold.out (100%) rename {ql => orc}/src/test/resources/orc-file-dump.json (100%) rename {ql => orc}/src/test/resources/orc-file-dump.out (100%) rename {ql => orc}/src/test/resources/orc-file-has-null.out (100%) rename {ql => storage-api}/src/java/org/apache/hadoop/hive/ql/exec/vector/expressions/StringExpr.java (100%) create mode 100644 storage-api/src/java/org/apache/hadoop/hive/ql/util/TimestampUtils.java diff --git a/bin/ext/orcfiledump.cmd b/bin/ext/orcfiledump.cmd index f78ed7f618ef..ff4b41013c14 100644 --- a/bin/ext/orcfiledump.cmd +++ b/bin/ext/orcfiledump.cmd @@ -14,7 +14,7 @@ @rem See the License for the specific language governing permissions and @rem limitations under the License. -set CLASS=org.apache.hadoop.hive.ql.io.orc.FileDump +set CLASS=org.apache.orc.tools.FileDump set HIVE_OPTS= set HADOOP_CLASSPATH= diff --git a/bin/ext/orcfiledump.sh b/bin/ext/orcfiledump.sh index 74f1a1ed0bd2..c84e61c87ad5 100644 --- a/bin/ext/orcfiledump.sh +++ b/bin/ext/orcfiledump.sh @@ -17,7 +17,7 @@ THISSERVICE=orcfiledump export SERVICE_LIST="${SERVICE_LIST}${THISSERVICE} " orcfiledump () { - CLASS=org.apache.hadoop.hive.ql.io.orc.FileDump + CLASS=org.apache.orc.tools.FileDump HIVE_OPTS='' execHiveCmd $CLASS "$@" } diff --git a/hcatalog/streaming/src/test/org/apache/hive/hcatalog/streaming/TestStreaming.java b/hcatalog/streaming/src/test/org/apache/hive/hcatalog/streaming/TestStreaming.java index 601642506682..4d2a2ee820d8 100644 --- a/hcatalog/streaming/src/test/org/apache/hive/hcatalog/streaming/TestStreaming.java +++ b/hcatalog/streaming/src/test/org/apache/hive/hcatalog/streaming/TestStreaming.java @@ -57,16 +57,15 @@ import org.apache.hadoop.hive.metastore.api.TxnAbortedException; import org.apache.hadoop.hive.metastore.api.TxnInfo; import org.apache.hadoop.hive.metastore.api.TxnState; -import org.apache.hadoop.hive.metastore.api.hive_metastoreConstants; import org.apache.hadoop.hive.metastore.txn.TxnDbUtil; import org.apache.hadoop.hive.ql.CommandNeedRetryException; import org.apache.hadoop.hive.ql.Driver; import org.apache.hadoop.hive.ql.io.AcidUtils; import org.apache.hadoop.hive.ql.io.IOConstants; -import org.apache.hadoop.hive.ql.io.orc.FileDump; +import org.apache.orc.impl.OrcAcidUtils; +import org.apache.orc.tools.FileDump; import org.apache.hadoop.hive.ql.io.orc.OrcFile; import org.apache.hadoop.hive.ql.io.orc.OrcInputFormat; -import org.apache.hadoop.hive.ql.io.orc.OrcRecordUpdater; import org.apache.hadoop.hive.ql.io.orc.OrcStruct; import org.apache.hadoop.hive.ql.io.orc.Reader; import org.apache.hadoop.hive.ql.io.orc.RecordReader; @@ -1089,7 +1088,7 @@ private ArrayList dumpBucket(Path orcFile) throws IOException { Reader reader = OrcFile.createReader(orcFile, OrcFile.readerOptions(conf).filesystem(fs)); - RecordReader rows = reader.rows(null); + RecordReader rows = reader.rows(); 
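The orcfiledump wrappers above now launch org.apache.orc.tools.FileDump instead of the old org.apache.hadoop.hive.ql.io.orc.FileDump. As a rough, hypothetical illustration only (the file path is a placeholder; the only assumption is that the relocated class keeps the public main(String[]) entry point the shell and cmd wrappers invoke), the same tool can be driven from Java:

// Hypothetical driver for the relocated dump tool; the path is a placeholder.
import org.apache.orc.tools.FileDump;

public class DumpOrcFile {
  public static void main(String[] args) throws Exception {
    // Roughly equivalent to: hive --service orcfiledump /tmp/example.orc
    FileDump.main(new String[] {"/tmp/example.orc"});
  }
}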
StructObjectInspector inspector = (StructObjectInspector) reader .getObjectInspector(); @@ -1561,7 +1560,7 @@ private void corruptSideFile(final String file, final HiveConf conf, final Map> offsetMap, final String key, final int numEntries) throws IOException { Path dataPath = new Path(file); - Path sideFilePath = OrcRecordUpdater.getSideFile(dataPath); + Path sideFilePath = OrcAcidUtils.getSideFile(dataPath); Path cPath = new Path(sideFilePath.getParent(), sideFilePath.getName() + ".corrupt"); FileSystem fs = sideFilePath.getFileSystem(conf); List offsets = offsetMap.get(key); diff --git a/llap-server/src/java/org/apache/hadoop/hive/llap/io/decode/OrcEncodedDataConsumer.java b/llap-server/src/java/org/apache/hadoop/hive/llap/io/decode/OrcEncodedDataConsumer.java index a689f106dd9b..619d1a47dff5 100644 --- a/llap-server/src/java/org/apache/hadoop/hive/llap/io/decode/OrcEncodedDataConsumer.java +++ b/llap-server/src/java/org/apache/hadoop/hive/llap/io/decode/OrcEncodedDataConsumer.java @@ -42,7 +42,7 @@ import org.apache.hadoop.hive.ql.io.orc.encoded.OrcBatchKey; import org.apache.hadoop.hive.ql.io.orc.encoded.Reader.OrcEncodedColumnBatch; import org.apache.hadoop.hive.ql.io.orc.RecordReaderImpl; -import org.apache.hadoop.hive.ql.io.orc.TreeReaderFactory; +import org.apache.orc.impl.TreeReaderFactory; import org.apache.hadoop.hive.ql.io.orc.WriterImpl; import org.apache.orc.OrcProto; diff --git a/llap-server/src/java/org/apache/hadoop/hive/llap/io/encoded/OrcEncodedDataReader.java b/llap-server/src/java/org/apache/hadoop/hive/llap/io/encoded/OrcEncodedDataReader.java index 7effe6944fdc..69c064730f79 100644 --- a/llap-server/src/java/org/apache/hadoop/hive/llap/io/encoded/OrcEncodedDataReader.java +++ b/llap-server/src/java/org/apache/hadoop/hive/llap/io/encoded/OrcEncodedDataReader.java @@ -67,13 +67,12 @@ import org.apache.hadoop.hive.ql.io.orc.OrcSplit; import org.apache.hadoop.hive.ql.io.orc.encoded.Reader; import org.apache.hadoop.hive.ql.io.orc.RecordReaderImpl; -import org.apache.hadoop.hive.ql.io.orc.RecordReaderImpl.SargApplier; import org.apache.hadoop.hive.ql.io.orc.encoded.EncodedOrcFile; import org.apache.hadoop.hive.ql.io.orc.encoded.EncodedReader; import org.apache.hadoop.hive.ql.io.orc.encoded.OrcBatchKey; import org.apache.hadoop.hive.ql.io.orc.encoded.Reader.OrcEncodedColumnBatch; import org.apache.hadoop.hive.ql.io.orc.encoded.Reader.PoolFactory; -import org.apache.hadoop.hive.ql.io.orc.RecordReaderUtils; +import org.apache.orc.impl.RecordReaderUtils; import org.apache.orc.StripeInformation; import org.apache.hadoop.hive.ql.io.sarg.SearchArgument; import org.apache.hadoop.mapred.FileSplit; @@ -343,7 +342,8 @@ protected Void performDataRead() throws IOException { // intermediate changes for individual columns will unset values in the array. // Skip this case for 0-column read. We could probably special-case it just like we do // in EncodedReaderImpl, but for now it's not that important. - if (colRgs.length > 0 && colRgs[0] == SargApplier.READ_NO_RGS) continue; + if (colRgs.length > 0 && colRgs[0] == + RecordReaderImpl.SargApplier.READ_NO_RGS) continue; // 6.1. Determine the columns to read (usually the same as requested). 
if (cols == null || cols.size() == colRgs.length) { @@ -691,12 +691,13 @@ public void returnData(OrcEncodedColumnBatch ecb) { */ private boolean determineRgsToRead(boolean[] globalIncludes, int rowIndexStride, ArrayList metadata) throws IOException { - SargApplier sargApp = null; + RecordReaderImpl.SargApplier sargApp = null; if (sarg != null && rowIndexStride != 0) { List types = fileMetadata.getTypes(); String[] colNamesForSarg = OrcInputFormat.getSargColumnNames( columnNames, types, globalIncludes, fileMetadata.isOriginalFormat()); - sargApp = new SargApplier(sarg, colNamesForSarg, rowIndexStride, types, globalIncludes.length); + sargApp = new RecordReaderImpl.SargApplier(sarg, colNamesForSarg, + rowIndexStride, types, globalIncludes.length); } boolean hasAnyData = false; // readState should have been initialized by this time with an empty array. @@ -710,8 +711,8 @@ private boolean determineRgsToRead(boolean[] globalIncludes, int rowIndexStride, rgsToRead = sargApp.pickRowGroups(stripe, stripeMetadata.getRowIndexes(), stripeMetadata.getBloomFilterIndexes(), true); } - boolean isNone = rgsToRead == SargApplier.READ_NO_RGS, - isAll = rgsToRead == SargApplier.READ_ALL_RGS; + boolean isNone = rgsToRead == RecordReaderImpl.SargApplier.READ_NO_RGS, + isAll = rgsToRead == RecordReaderImpl.SargApplier.READ_ALL_RGS; hasAnyData = hasAnyData || !isNone; if (LlapIoImpl.ORC_LOGGER.isTraceEnabled()) { if (isNone) { diff --git a/llap-server/src/java/org/apache/hadoop/hive/llap/io/metadata/OrcFileMetadata.java b/llap-server/src/java/org/apache/hadoop/hive/llap/io/metadata/OrcFileMetadata.java index 4e42a0f58e23..c9b0a4d7160c 100644 --- a/llap-server/src/java/org/apache/hadoop/hive/llap/io/metadata/OrcFileMetadata.java +++ b/llap-server/src/java/org/apache/hadoop/hive/llap/io/metadata/OrcFileMetadata.java @@ -29,11 +29,11 @@ import org.apache.hadoop.hive.ql.io.SyntheticFileId; import org.apache.hadoop.hive.ql.io.orc.OrcInputFormat; import org.apache.hadoop.hive.ql.io.orc.Reader; -import org.apache.hadoop.hive.ql.io.orc.ReaderImpl.StripeInformationImpl; import org.apache.orc.CompressionKind; import org.apache.orc.FileMetadata; import org.apache.orc.OrcProto; import org.apache.orc.StripeInformation; +import org.apache.orc.impl.ReaderImpl; /** ORC file metadata. Currently contains some duplicate info due to how different parts * of ORC use different info. 
Ideally we would get rid of protobuf structs in code beyond reading, @@ -72,7 +72,7 @@ public final class OrcFileMetadata extends LlapCacheableBuffer implements FileMe @VisibleForTesting public static OrcFileMetadata createDummy(Object fileKey) { OrcFileMetadata ofm = new OrcFileMetadata(fileKey); - ofm.stripes.add(new StripeInformationImpl( + ofm.stripes.add(new ReaderImpl.StripeInformationImpl( OrcProto.StripeInformation.getDefaultInstance())); ofm.fileStats.add(OrcProto.ColumnStatistics.getDefaultInstance()); ofm.stripeStats.add(OrcProto.StripeStatistics.newBuilder().addColStats(createStatsDummy()).build()); diff --git a/orc/pom.xml b/orc/pom.xml index 2d80c9747355..cc270771b8c3 100644 --- a/orc/pom.xml +++ b/orc/pom.xml @@ -71,6 +71,33 @@ + + org.apache.hadoop + hadoop-hdfs + ${hadoop.version} + + + javax.servlet + servlet-api + + + javax.servlet.jsp + jsp-api + + + org.mortbay.jetty + jetty + + + org.mortbay.jetty + jetty-util + + + org.apache.avro + avro + + + org.iq80.snappy snappy diff --git a/ql/src/java/org/apache/hadoop/hive/ql/io/FileFormatException.java b/orc/src/java/org/apache/orc/FileFormatException.java similarity index 96% rename from ql/src/java/org/apache/hadoop/hive/ql/io/FileFormatException.java rename to orc/src/java/org/apache/orc/FileFormatException.java index 12417aab2169..2cebea78f960 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/io/FileFormatException.java +++ b/orc/src/java/org/apache/orc/FileFormatException.java @@ -15,7 +15,7 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package org.apache.hadoop.hive.ql.io; +package org.apache.orc; import java.io.IOException; diff --git a/orc/src/java/org/apache/orc/OrcFile.java b/orc/src/java/org/apache/orc/OrcFile.java index 85506ff35750..7dd73334dd53 100644 --- a/orc/src/java/org/apache/orc/OrcFile.java +++ b/orc/src/java/org/apache/orc/OrcFile.java @@ -25,6 +25,7 @@ import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.Path; import org.apache.orc.impl.MemoryManager; +import org.apache.orc.impl.ReaderImpl; import org.apache.orc.impl.WriterImpl; /** @@ -212,6 +213,11 @@ public static ReaderOptions readerOptions(Configuration conf) { return new ReaderOptions(conf); } + public static Reader createReader(Path path, + ReaderOptions options) throws IOException { + return new ReaderImpl(path, options); + } + public interface WriterContext { Writer getWriter(); } diff --git a/orc/src/java/org/apache/orc/Reader.java b/orc/src/java/org/apache/orc/Reader.java index 39de763bcafd..87f3293fe4d8 100644 --- a/orc/src/java/org/apache/orc/Reader.java +++ b/orc/src/java/org/apache/orc/Reader.java @@ -334,7 +334,7 @@ public String toString() { * @return a new RecordReader * @throws IOException */ - RecordReader rowsOptions(Options options) throws IOException; + RecordReader rows(Options options) throws IOException; /** * @return List of integers representing version of the file, in order from major to minor. 
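The OrcFile and Reader hunks above give the orc module a public createReader(Path, ReaderOptions) factory and rename rowsOptions(Options) to rows(Options). A minimal sketch of reading a file through that surface follows; the getSchema(), createRowBatch(), no-argument Reader.Options() constructor, and boolean-returning nextBatch(VectorizedRowBatch) calls are assumptions about the surrounding API, not part of this patch:

// Sketch only: read an ORC file batch by batch through the relocated API.
// The path is a placeholder; see the assumptions noted above.
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch;
import org.apache.orc.OrcFile;
import org.apache.orc.Reader;
import org.apache.orc.RecordReader;

public class ReadOrcExample {
  public static void main(String[] args) throws Exception {
    Configuration conf = new Configuration();
    Reader reader = OrcFile.createReader(new Path("/tmp/example.orc"),
        OrcFile.readerOptions(conf));
    VectorizedRowBatch batch = reader.getSchema().createRowBatch();
    RecordReader rows = reader.rows(new Reader.Options());
    while (rows.nextBatch(batch)) {
      System.out.println("read " + batch.size + " rows");
    }
    rows.close();
  }
}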
diff --git a/orc/src/java/org/apache/orc/TypeDescription.java b/orc/src/java/org/apache/orc/TypeDescription.java index b8e057eed052..ffe3c1ffed73 100644 --- a/orc/src/java/org/apache/orc/TypeDescription.java +++ b/orc/src/java/org/apache/orc/TypeDescription.java @@ -344,25 +344,25 @@ private ColumnVector createColumn(int maxSize) { case INT: case LONG: case DATE: - return new LongColumnVector(); + return new LongColumnVector(maxSize); case TIMESTAMP: - return new TimestampColumnVector(); + return new TimestampColumnVector(maxSize); case FLOAT: case DOUBLE: - return new DoubleColumnVector(); + return new DoubleColumnVector(maxSize); case DECIMAL: - return new DecimalColumnVector(precision, scale); + return new DecimalColumnVector(maxSize, precision, scale); case STRING: case BINARY: case CHAR: case VARCHAR: - return new BytesColumnVector(); + return new BytesColumnVector(maxSize); case STRUCT: { ColumnVector[] fieldVector = new ColumnVector[children.size()]; for(int i=0; i < fieldVector.length; ++i) { fieldVector[i] = children.get(i).createColumn(maxSize); } - return new StructColumnVector(VectorizedRowBatch.DEFAULT_SIZE, + return new StructColumnVector(maxSize, fieldVector); } case UNION: { @@ -370,14 +370,14 @@ private ColumnVector createColumn(int maxSize) { for(int i=0; i < fieldVector.length; ++i) { fieldVector[i] = children.get(i).createColumn(maxSize); } - return new UnionColumnVector(VectorizedRowBatch.DEFAULT_SIZE, + return new UnionColumnVector(maxSize, fieldVector); } case LIST: - return new ListColumnVector(VectorizedRowBatch.DEFAULT_SIZE, + return new ListColumnVector(maxSize, children.get(0).createColumn(maxSize)); case MAP: - return new MapColumnVector(VectorizedRowBatch.DEFAULT_SIZE, + return new MapColumnVector(maxSize, children.get(0).createColumn(maxSize), children.get(1).createColumn(maxSize)); default: diff --git a/orc/src/java/org/apache/orc/impl/AcidStats.java b/orc/src/java/org/apache/orc/impl/AcidStats.java new file mode 100644 index 000000000000..6657fe9be831 --- /dev/null +++ b/orc/src/java/org/apache/orc/impl/AcidStats.java @@ -0,0 +1,60 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.orc.impl; + +/** + * Statistics about the ACID operations in an ORC file + */ +public class AcidStats { + public long inserts; + public long updates; + public long deletes; + + public AcidStats() { + inserts = 0; + updates = 0; + deletes = 0; + } + + public AcidStats(String serialized) { + String[] parts = serialized.split(","); + inserts = Long.parseLong(parts[0]); + updates = Long.parseLong(parts[1]); + deletes = Long.parseLong(parts[2]); + } + + public String serialize() { + StringBuilder builder = new StringBuilder(); + builder.append(inserts); + builder.append(","); + builder.append(updates); + builder.append(","); + builder.append(deletes); + return builder.toString(); + } + + @Override + public String toString() { + StringBuilder builder = new StringBuilder(); + builder.append(" inserts: ").append(inserts); + builder.append(" updates: ").append(updates); + builder.append(" deletes: ").append(deletes); + return builder.toString(); + } +} diff --git a/ql/src/java/org/apache/hadoop/hive/ql/io/orc/ConvertTreeReaderFactory.java b/orc/src/java/org/apache/orc/impl/ConvertTreeReaderFactory.java similarity index 76% rename from ql/src/java/org/apache/hadoop/hive/ql/io/orc/ConvertTreeReaderFactory.java rename to orc/src/java/org/apache/orc/impl/ConvertTreeReaderFactory.java index 74a097ea2bde..3ba56f77f3eb 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/io/orc/ConvertTreeReaderFactory.java +++ b/orc/src/java/org/apache/orc/impl/ConvertTreeReaderFactory.java @@ -15,16 +15,13 @@ * See the License for the specific language governing permissions and * limitations under the License. 
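The new AcidStats helper above stores its three counters as a comma-separated string. A small round trip grounded in the code shown (only the counter values are invented):

// Round trip of the "inserts,updates,deletes" text format used by AcidStats.
import org.apache.orc.impl.AcidStats;

public class AcidStatsExample {
  public static void main(String[] args) {
    AcidStats stats = new AcidStats();
    stats.inserts = 100;
    stats.updates = 5;
    stats.deletes = 2;
    String text = stats.serialize();        // "100,5,2"
    AcidStats parsed = new AcidStats(text); // parses the same format back
    System.out.println(parsed);             // " inserts: 100 updates: 5 deletes: 2"
  }
}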
*/ -package org.apache.hadoop.hive.ql.io.orc; +package org.apache.orc.impl; import java.io.IOException; -import java.io.UnsupportedEncodingException; import java.nio.charset.StandardCharsets; import java.sql.Date; import java.sql.Timestamp; -import java.util.ArrayList; import java.util.EnumMap; -import java.util.List; import java.util.Map; import org.apache.hadoop.hive.common.type.HiveDecimal; @@ -35,38 +32,20 @@ import org.apache.hadoop.hive.ql.exec.vector.LongColumnVector; import org.apache.hadoop.hive.ql.exec.vector.TimestampColumnVector; import org.apache.hadoop.hive.ql.exec.vector.expressions.StringExpr; -import org.apache.hadoop.hive.serde2.io.ByteWritable; +import org.apache.hadoop.hive.ql.util.TimestampUtils; import org.apache.hadoop.hive.serde2.io.DateWritable; -import org.apache.hadoop.hive.serde2.io.DoubleWritable; -import org.apache.hadoop.hive.serde2.io.HiveCharWritable; import org.apache.hadoop.hive.serde2.io.HiveDecimalWritable; -import org.apache.hadoop.hive.serde2.io.HiveVarcharWritable; -import org.apache.hadoop.hive.serde2.io.ShortWritable; -import org.apache.hadoop.hive.serde2.io.TimestampWritable; -import org.apache.hadoop.io.BooleanWritable; import org.apache.hadoop.io.BytesWritable; import org.apache.hadoop.io.FloatWritable; -import org.apache.hadoop.io.IntWritable; -import org.apache.hadoop.io.LongWritable; -import org.apache.hadoop.io.Text; -import org.apache.hadoop.io.Writable; import org.apache.orc.OrcProto; import org.apache.orc.TypeDescription; import org.apache.orc.TypeDescription.Category; -import org.apache.orc.impl.InStream; -import org.apache.orc.impl.PositionProvider; -import org.apache.orc.impl.StreamName; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; /** * Convert ORC tree readers. */ public class ConvertTreeReaderFactory extends TreeReaderFactory { - private static final Logger LOG = - LoggerFactory.getLogger(TreeReaderFactory.class); - /** * Override methods like checkEncoding to pass-thru to the convert TreeReader. */ @@ -78,14 +57,12 @@ public static class ConvertTreeReader extends TreeReader { super(columnId); } - private static List numericTypeList = new ArrayList(); - // The ordering of types here is used to determine which numeric types // are common/convertible to one another. Probably better to rely on the // ordering explicitly defined here than to assume that the enum values // that were arbitrarily assigned in PrimitiveCategory work for our purposes. 
private static EnumMap numericTypes = - new EnumMap(TypeDescription.Category.class); + new EnumMap<>(TypeDescription.Category.class); static { registerNumericType(TypeDescription.Category.BOOLEAN, 1); @@ -99,7 +76,6 @@ public static class ConvertTreeReader extends TreeReader { } private static void registerNumericType(TypeDescription.Category kind, int level) { - numericTypeList.add(kind); numericTypes.put(kind, level); } @@ -121,61 +97,6 @@ protected TreeReader getStringGroupTreeReader(int columnId, } } - protected Writable getStringGroupWritable(TypeDescription fileType) - throws IOException { - switch (fileType.getCategory()) { - case STRING: - return new Text(); - case CHAR: - return new HiveCharWritable(); - case VARCHAR: - return new HiveVarcharWritable(); - default: - throw new RuntimeException("Unexpected type kind " + fileType.getCategory().name()); - } - } - - protected Writable getStringGroupResultFromString(Object previous, - TypeDescription readerType, String string) { - switch (readerType.getCategory()) { - case STRING: - { - Text textResult; - if (previous == null) { - textResult = new Text(); - } else { - textResult = (Text) previous; - } - textResult.set(string); - return textResult; - } - case CHAR: - { - HiveCharWritable hiveCharResult; - if (previous == null) { - hiveCharResult = new HiveCharWritable(); - } else { - hiveCharResult = (HiveCharWritable) previous; - } - hiveCharResult.set(string, readerType.getMaxLength()); - return hiveCharResult; - } - case VARCHAR: - { - HiveVarcharWritable hiveVarcharResult; - if (previous == null) { - hiveVarcharResult = new HiveVarcharWritable(); - } else { - hiveVarcharResult = (HiveVarcharWritable) previous; - } - hiveVarcharResult.set(string, readerType.getMaxLength()); - return hiveVarcharResult; - } - default: - throw new RuntimeException("Unexpected type kind " + readerType.getCategory().name()); - } - } - protected void assignStringGroupVectorEntry(BytesColumnVector bytesColVector, int elementNum, TypeDescription readerType, byte[] bytes) { assignStringGroupVectorEntry(bytesColVector, @@ -330,42 +251,6 @@ protected Date parseDateFromString(String string) { } } - protected String stringFromStringGroupTreeReader( - TreeReader stringGroupTreeReader, Writable writable, - TypeDescription fileType) throws IOException { - switch (fileType.getCategory()) { - case STRING: - { - Text readTextResult = - (Text) ((StringTreeReader) stringGroupTreeReader).next(writable); - if (readTextResult == null) { - return null; - } - return readTextResult.toString(); - } - case CHAR: - { - HiveCharWritable readHiveCharResult = - (HiveCharWritable) ((CharTreeReader) stringGroupTreeReader).next(writable); - if (readHiveCharResult == null) { - return null; - } - return readHiveCharResult.getStrippedValue().toString(); - } - case VARCHAR: - { - HiveVarcharWritable readHiveVarcharResult = - (HiveVarcharWritable) ((VarcharTreeReader) stringGroupTreeReader).next(writable); - if (readHiveVarcharResult == null) { - return null; - } - return readHiveVarcharResult.toString(); - } - default: - throw new RuntimeException("Unexpected type kind " + fileType.getCategory().name()); - } - } - protected String stringFromBytesColumnVectorEntry( BytesColumnVector bytesColVector, int elementNum) { String string; @@ -468,69 +353,6 @@ public long downCastAnyInteger(long input, TypeDescription readerType) { } } - protected Writable anyIntegerWritable(long longValue, Object previous, - TypeDescription readerType) { - switch (readerType.getCategory()) { - case BOOLEAN: - { - 
BooleanWritable booleanResult; - if (previous == null) { - booleanResult = new BooleanWritable(); - } else { - booleanResult = (BooleanWritable) previous; - } - booleanResult.set(longValue != 0); - return booleanResult; - } - case BYTE: - { - ByteWritable byteResult; - if (previous == null) { - byteResult = new ByteWritable(); - } else { - byteResult = (ByteWritable) previous; - } - byteResult.set((byte) longValue); - return byteResult; - } - case SHORT: - { - ShortWritable shortResult; - if (previous == null) { - shortResult = new ShortWritable(); - } else { - shortResult = (ShortWritable) previous; - } - shortResult.set((short) longValue); - return shortResult; - } - case INT: - { - IntWritable intResult; - if (previous == null) { - intResult = new IntWritable(); - } else { - intResult = (IntWritable) previous; - } - intResult.set((int) longValue); - return intResult; - } - case LONG: - { - LongWritable longResult; - if (previous == null) { - longResult = new LongWritable(); - } else { - longResult = (LongWritable) previous; - } - longResult.set(longValue); - return longResult; - } - default: - throw new RuntimeException("Unexpected type kind " + readerType.getCategory().name()); - } - } - protected boolean integerDownCastNeeded(TypeDescription fileType, TypeDescription readerType) { Integer fileLevel = numericTypes.get(fileType.getCategory()); Integer schemaLevel = numericTypes.get(readerType.getCategory()); @@ -571,38 +393,6 @@ public static class AnyIntegerTreeReader extends ConvertTreeReader { setConvertTreeReader(anyIntegerTreeReader); } - @Override - Object next(Object previous) throws IOException { - throw new RuntimeException("Call read() and getLong instead"); - } - - protected boolean read() throws IOException { - anyIntegerTreeReader.readValuePresent(); - if (!anyIntegerTreeReader.valuePresent) { - return false; - } - switch (fileTypeCategory) { - case BOOLEAN: - longValue = ((BooleanTreeReader) anyIntegerTreeReader).reader.next(); - break; - case BYTE: - longValue = ((ByteTreeReader) anyIntegerTreeReader).reader.next(); - break; - case SHORT: - longValue = ((ShortTreeReader) anyIntegerTreeReader).reader.next(); - break; - case INT: - longValue = ((IntTreeReader) anyIntegerTreeReader).reader.next(); - break; - case LONG: - longValue = ((LongTreeReader) anyIntegerTreeReader).reader.next(); - break; - default: - throw new RuntimeException("Unexpected type kind " + fileTypeCategory.name()); - } - return true; - } - protected long getLong() throws IOException { return longValue; } @@ -642,16 +432,6 @@ public static class AnyIntegerFromAnyIntegerTreeReader extends ConvertTreeReader downCastNeeded = integerDownCastNeeded(fileType, readerType); } - @Override - Object next(Object previous) throws IOException { - Writable result = null; - if (anyIntegerAsLongTreeReader.read()) { - long longValue = anyIntegerAsLongTreeReader.getLong(); - result = anyIntegerWritable(longValue, previous, readerType); - } - return result; - } - @Override public void nextVector(ColumnVector previousVector, boolean[] isNull, @@ -703,20 +483,6 @@ public static class AnyIntegerFromFloatTreeReader extends ConvertTreeReader { floatResult = new FloatWritable(); } - @Override - Object next(Object previous) throws IOException { - - FloatWritable readfloatResult = - (FloatWritable) floatTreeReader.next(floatResult); - - Writable result = null; - if (readfloatResult != null) { - long longValue = (long) readfloatResult.get(); - result = anyIntegerWritable(longValue, previous, readerType); - } - return result; - } - 
@Override public void setConvertVectorElement(int elementNum) throws IOException { float floatValue = (float) doubleColVector.vector[elementNum]; @@ -746,7 +512,6 @@ public static class AnyIntegerFromDoubleTreeReader extends ConvertTreeReader { private DoubleTreeReader doubleTreeReader; private final TypeDescription readerType; - private DoubleWritable doubleResult; private DoubleColumnVector doubleColVector; private LongColumnVector longColVector; @@ -756,21 +521,6 @@ public static class AnyIntegerFromDoubleTreeReader extends ConvertTreeReader { this.readerType = readerType; doubleTreeReader = new DoubleTreeReader(columnId); setConvertTreeReader(doubleTreeReader); - doubleResult = new DoubleWritable(); - } - - @Override - Object next(Object previous) throws IOException { - - DoubleWritable readDoubleResult = - (DoubleWritable) doubleTreeReader.next(doubleResult); - - Writable result = null; - if (readDoubleResult != null) { - long longValue = (long) readDoubleResult.get(); - result = anyIntegerWritable(longValue, previous, readerType); - } - return result; } @Override @@ -818,20 +568,6 @@ public static class AnyIntegerFromDecimalTreeReader extends ConvertTreeReader { hiveDecimalResult = new HiveDecimalWritable(); } - @Override - Object next(Object previous) throws IOException { - - HiveDecimalWritable readHiveDecimalResult = - (HiveDecimalWritable) decimalTreeReader.next(hiveDecimalResult); - - Writable result = null; - if (readHiveDecimalResult != null) { - long longValue = readHiveDecimalResult.getHiveDecimal().longValue(); - result = anyIntegerWritable(longValue, previous, readerType); - } - return result; - } - @Override public void setConvertVectorElement(int elementNum) throws IOException { longColVector.vector[elementNum] = @@ -862,7 +598,6 @@ public static class AnyIntegerFromStringGroupTreeReader extends ConvertTreeReade private final TypeDescription fileType; private final TypeDescription readerType; - private Writable writable; private BytesColumnVector bytesColVector; private LongColumnVector longColVector; @@ -873,23 +608,6 @@ public static class AnyIntegerFromStringGroupTreeReader extends ConvertTreeReade this.readerType = readerType; stringGroupTreeReader = getStringGroupTreeReader(columnId, fileType); setConvertTreeReader(stringGroupTreeReader); - writable = getStringGroupWritable(fileType); - } - - @Override - Object next(Object previous) throws IOException { - - String stringValue = stringFromStringGroupTreeReader( - stringGroupTreeReader, writable, fileType); - - Writable result = null; - if (stringValue != null) { - long longValue = parseLongFromString(stringValue); - if (!getIsParseError()) { - result = anyIntegerWritable(longValue, previous, readerType); - } - } - return result; } @Override @@ -926,7 +644,6 @@ public static class AnyIntegerFromTimestampTreeReader extends ConvertTreeReader private TimestampTreeReader timestampTreeReader; private final TypeDescription readerType; - private TimestampWritable timestampResult; private TimestampColumnVector timestampColVector; private LongColumnVector longColVector; @@ -936,29 +653,13 @@ public static class AnyIntegerFromTimestampTreeReader extends ConvertTreeReader this.readerType = readerType; timestampTreeReader = new TimestampTreeReader(columnId, skipCorrupt); setConvertTreeReader(timestampTreeReader); - timestampResult = new TimestampWritable(); - } - - @Override - Object next(Object previous) throws IOException { - - TimestampWritable readHiveTimestampResult = - (TimestampWritable) 
timestampTreeReader.next(timestampResult); - - Writable result = null; - if (readHiveTimestampResult != null) { - // Use TimestampWritable's getSeconds. - long longValue = readHiveTimestampResult.getSeconds(); - result = anyIntegerWritable(longValue, previous, readerType); - } - return result; } @Override public void setConvertVectorElement(int elementNum) throws IOException { - timestampResult.set(timestampColVector.asScratchTimestamp(elementNum)); // Use TimestampWritable's getSeconds. - long longValue = timestampResult.getSeconds(); + long longValue = TimestampUtils.millisToSeconds( + timestampColVector.asScratchTimestamp(elementNum).getTime()); longColVector.vector[elementNum] = downCastAnyInteger(longValue, readerType); } @@ -994,24 +695,6 @@ public static class FloatFromAnyIntegerTreeReader extends ConvertTreeReader { setConvertTreeReader(anyIntegerAsLongTreeReader); } - @Override - Object next(Object previous) throws IOException { - FloatWritable result = null; - if (anyIntegerAsLongTreeReader.read()) { - long longValue = anyIntegerAsLongTreeReader.getLong(); - float floatValue = (float) longValue; - if (!Float.isNaN(floatValue)){ - if (previous == null) { - result = new FloatWritable(); - } else { - result = (FloatWritable) previous; - } - result.set(floatValue); - } - } - return result; - } - @Override public void setConvertVectorElement(int elementNum) throws IOException { float floatValue = (float) longColVector.vector[elementNum]; @@ -1044,31 +727,10 @@ public static class FloatFromDoubleTreeReader extends ConvertTreeReader { private DoubleTreeReader doubleTreeReader; - private DoubleWritable doubleResult; - FloatFromDoubleTreeReader(int columnId) throws IOException { super(columnId); doubleTreeReader = new DoubleTreeReader(columnId); setConvertTreeReader(doubleTreeReader); - doubleResult = new DoubleWritable(); - } - - @Override - Object next(Object previous) throws IOException { - - DoubleWritable readDoubleResult = - (DoubleWritable) doubleTreeReader.next(doubleResult); - - FloatWritable result = null; - if (readDoubleResult != null) { - if (previous == null) { - result = new FloatWritable(); - } else { - result = (FloatWritable) previous; - } - result.set((float) readDoubleResult.get()); - } - return result; } @Override @@ -1125,25 +787,6 @@ public static class FloatFromDecimalTreeReader extends ConvertTreeReader { hiveDecimalResult = new HiveDecimalWritable(); } - @Override - Object next(Object previous) throws IOException { - - HiveDecimalWritable readHiveDecimalResult = - (HiveDecimalWritable) decimalTreeReader.next(hiveDecimalResult); - - FloatWritable result = null; - if (readHiveDecimalResult != null) { - double doubleValue = readHiveDecimalResult.getHiveDecimal().doubleValue(); - if (previous == null) { - result = new FloatWritable(); - } else { - result = (FloatWritable) previous; - } - result.set((float) doubleValue); - } - return result; - } - @Override public void setConvertVectorElement(int elementNum) throws IOException { doubleColVector.vector[elementNum] = @@ -1171,7 +814,6 @@ public static class FloatFromStringGroupTreeReader extends ConvertTreeReader { private TreeReader stringGroupTreeReader; private final TypeDescription fileType; - private Writable writable; private BytesColumnVector bytesColVector; private DoubleColumnVector doubleColVector; @@ -1181,28 +823,6 @@ public static class FloatFromStringGroupTreeReader extends ConvertTreeReader { this.fileType = fileType; stringGroupTreeReader = getStringGroupTreeReader(columnId, fileType); 
setConvertTreeReader(stringGroupTreeReader); - writable = getStringGroupWritable(fileType); - } - - @Override - Object next(Object previous) throws IOException { - - String stringValue = stringFromStringGroupTreeReader( - stringGroupTreeReader, writable, fileType); - - FloatWritable result = null; - if (stringValue != null) { - float floatValue = parseFloatFromString(stringValue); - if (!getIsParseError()) { - if (previous == null) { - result = new FloatWritable(); - } else { - result = (FloatWritable) previous; - } - result.set(floatValue); - } - } - return result; } @Override @@ -1239,7 +859,6 @@ public static class FloatFromTimestampTreeReader extends ConvertTreeReader { private TimestampTreeReader timestampTreeReader; private final TypeDescription readerType; - private TimestampWritable timestampResult; private TimestampColumnVector timestampColVector; private DoubleColumnVector doubleColVector; @@ -1249,32 +868,12 @@ public static class FloatFromTimestampTreeReader extends ConvertTreeReader { this.readerType = readerType; timestampTreeReader = new TimestampTreeReader(columnId, skipCorrupt); setConvertTreeReader(timestampTreeReader); - timestampResult = new TimestampWritable(); - } - - @Override - Object next(Object previous) throws IOException { - - TimestampWritable readTimestampResult = - (TimestampWritable) timestampTreeReader.next(timestampResult); - - FloatWritable result = null; - if (readTimestampResult != null) { - double doubleValue = readTimestampResult.getDouble(); - if (previous == null) { - result = new FloatWritable(); - } else { - result = (FloatWritable) previous; - } - result.set((float) doubleValue); - } - return result; } @Override public void setConvertVectorElement(int elementNum) throws IOException { - timestampResult.set(timestampColVector.asScratchTimestamp(elementNum)); - doubleColVector.vector[elementNum] = (float) timestampResult.getDouble(); + doubleColVector.vector[elementNum] = (float) TimestampUtils.getDouble( + timestampColVector.asScratchTimestamp(elementNum)); } @Override @@ -1308,24 +907,6 @@ public static class DoubleFromAnyIntegerTreeReader extends ConvertTreeReader { setConvertTreeReader(anyIntegerAsLongTreeReader); } - @Override - Object next(Object previous) throws IOException { - DoubleWritable result = null; - if (anyIntegerAsLongTreeReader.read()) { - long longValue = anyIntegerAsLongTreeReader.getLong(); - double doubleValue = (double) longValue; - if (!Double.isNaN(doubleValue)) { - if (previous == null) { - result = new DoubleWritable(); - } else { - result = (DoubleWritable) previous; - } - result.set(doubleValue); - } - } - return result; - } - @Override public void setConvertVectorElement(int elementNum) { @@ -1368,24 +949,6 @@ public static class DoubleFromFloatTreeReader extends ConvertTreeReader { floatResult = new FloatWritable(); } - @Override - Object next(Object previous) throws IOException { - - FloatWritable readFloatResult = - (FloatWritable) floatTreeReader.next(floatResult); - - DoubleWritable result = null; - if (readFloatResult != null) { - if (previous == null) { - result = new DoubleWritable(); - } else { - result = (DoubleWritable) previous; - } - result.set(readFloatResult.get()); - } - return result; - } - @Override public void nextVector(ColumnVector previousVector, boolean[] isNull, @@ -1417,25 +980,6 @@ public static class DoubleFromDecimalTreeReader extends ConvertTreeReader { hiveDecimalResult = new HiveDecimalWritable(); } - @Override - Object next(Object previous) throws IOException { - - HiveDecimalWritable 
readHiveDecimalResult = - (HiveDecimalWritable) decimalTreeReader.next(hiveDecimalResult); - - DoubleWritable result = null; - if (readHiveDecimalResult != null) { - double doubleValue = readHiveDecimalResult.getHiveDecimal().doubleValue(); - if (previous == null) { - result = new DoubleWritable(); - } else { - result = (DoubleWritable) previous; - } - result.set(doubleValue); - } - return result; - } - @Override public void setConvertVectorElement(int elementNum) throws IOException { doubleColVector.vector[elementNum] = @@ -1463,7 +1007,6 @@ public static class DoubleFromStringGroupTreeReader extends ConvertTreeReader { private TreeReader stringGroupTreeReader; private final TypeDescription fileType; - private Writable writable; private BytesColumnVector bytesColVector; private DoubleColumnVector doubleColVector; @@ -1473,28 +1016,6 @@ public static class DoubleFromStringGroupTreeReader extends ConvertTreeReader { this.fileType = fileType; stringGroupTreeReader = getStringGroupTreeReader(columnId, fileType); setConvertTreeReader(stringGroupTreeReader); - writable = getStringGroupWritable(fileType); - } - - @Override - Object next(Object previous) throws IOException { - - String stringValue = stringFromStringGroupTreeReader( - stringGroupTreeReader, writable, fileType); - - DoubleWritable result = null; - if (stringValue != null) { - double doubleValue = parseDoubleFromString(stringValue); - if (!getIsParseError()) { - if (previous == null) { - result = new DoubleWritable(); - } else { - result = (DoubleWritable) previous; - } - result.set(doubleValue); - } - } - return result; } @Override @@ -1530,7 +1051,6 @@ public static class DoubleFromTimestampTreeReader extends ConvertTreeReader { private TimestampTreeReader timestampTreeReader; private final TypeDescription readerType; - private TimestampWritable timestampResult; private TimestampColumnVector timestampColVector; private DoubleColumnVector doubleColVector; @@ -1540,32 +1060,12 @@ public static class DoubleFromTimestampTreeReader extends ConvertTreeReader { this.readerType = readerType; timestampTreeReader = new TimestampTreeReader(columnId, skipCorrupt); setConvertTreeReader(timestampTreeReader); - timestampResult = new TimestampWritable(); - } - - @Override - Object next(Object previous) throws IOException { - - TimestampWritable readTimestampResult = - (TimestampWritable) timestampTreeReader.next(timestampResult); - - DoubleWritable result = null; - if (readTimestampResult != null) { - double doubleValue = readTimestampResult.getDouble(); - if (previous == null) { - result = new DoubleWritable(); - } else { - result = (DoubleWritable) previous; - } - result.set(doubleValue); - } - return result; } @Override public void setConvertVectorElement(int elementNum) throws IOException { - timestampResult.set(timestampColVector.asScratchTimestamp(elementNum)); - doubleColVector.vector[elementNum] = timestampResult.getDouble(); + doubleColVector.vector[elementNum] = TimestampUtils.getDouble( + timestampColVector.asScratchTimestamp(elementNum)); } @Override @@ -1603,16 +1103,6 @@ public static class DecimalFromAnyIntegerTreeReader extends ConvertTreeReader { setConvertTreeReader(anyIntegerAsLongTreeReader); } - @Override - Object next(Object previous) throws IOException { - HiveDecimalWritable result = null; - if (anyIntegerAsLongTreeReader.read()) { - long longValue = anyIntegerAsLongTreeReader.getLong(); - result = new HiveDecimalWritable(longValue); - } - return result; - } - @Override public void setConvertVectorElement(int elementNum) { 
long longValue = longColVector.vector[elementNum]; @@ -1657,28 +1147,6 @@ public static class DecimalFromFloatTreeReader extends ConvertTreeReader { floatResult = new FloatWritable(); } - @Override - Object next(Object previous) throws IOException { - - FloatWritable readFloatResult = - (FloatWritable) floatTreeReader.next(floatResult); - - HiveDecimalWritable result = null; - if (readFloatResult != null) { - HiveDecimal value = - HiveDecimal.create(Float.toString(readFloatResult.get())); - if (value != null) { - if (previous == null) { - result = new HiveDecimalWritable(); - } else { - result = (HiveDecimalWritable) previous; - } - result.set(value); - } - } - return result; - } - @Override public void setConvertVectorElement(int elementNum) throws IOException { float floatValue = (float) doubleColVector.vector[elementNum]; @@ -1717,42 +1185,14 @@ public static class DecimalFromDoubleTreeReader extends ConvertTreeReader { private DoubleTreeReader doubleTreeReader; - private int precision; - private int scale; - private DoubleWritable doubleResult; private DoubleColumnVector doubleColVector; private DecimalColumnVector decimalColVector; DecimalFromDoubleTreeReader(int columnId, TypeDescription readerType) throws IOException { super(columnId); - this.precision = readerType.getPrecision(); - this.scale = readerType.getScale(); doubleTreeReader = new DoubleTreeReader(columnId); setConvertTreeReader(doubleTreeReader); - doubleResult = new DoubleWritable(); - } - - @Override - Object next(Object previous) throws IOException { - - DoubleWritable readDoubleResult = - (DoubleWritable) doubleTreeReader.next(doubleResult); - - HiveDecimalWritable result = null; - if (readDoubleResult != null) { - HiveDecimal value = - HiveDecimal.create(Double.toString(readDoubleResult.get())); - if (value != null) { - if (previous == null) { - result = new HiveDecimalWritable(); - } else { - result = (HiveDecimalWritable) previous; - } - result.set(value); - } - } - return result; } @Override @@ -1788,42 +1228,15 @@ public static class DecimalFromStringGroupTreeReader extends ConvertTreeReader { private TreeReader stringGroupTreeReader; private final TypeDescription fileType; - private Writable writable; private BytesColumnVector bytesColVector; - private int precision; - private int scale; private DecimalColumnVector decimalColVector; DecimalFromStringGroupTreeReader(int columnId, TypeDescription fileType, TypeDescription readerType) throws IOException { super(columnId); this.fileType = fileType; - this.precision = readerType.getPrecision(); - this.scale = readerType.getScale(); stringGroupTreeReader = getStringGroupTreeReader(columnId, fileType); setConvertTreeReader(stringGroupTreeReader); - writable = getStringGroupWritable(fileType); - } - - @Override - Object next(Object previous) throws IOException { - - String stringValue = stringFromStringGroupTreeReader( - stringGroupTreeReader, writable, fileType); - - HiveDecimalWritable result = null; - if (stringValue != null) { - HiveDecimal value = parseDecimalFromString(stringValue); - if (value != null) { - if (previous == null) { - result = new HiveDecimalWritable(); - } else { - result = (HiveDecimalWritable) previous; - } - result.set(value, precision, scale); - } - } - return result; } @Override @@ -1859,7 +1272,6 @@ public static class DecimalFromTimestampTreeReader extends ConvertTreeReader { private TimestampTreeReader timestampTreeReader; private final TypeDescription readerType; - private TimestampWritable timestampResult; private TimestampColumnVector 
timestampColVector; private int precision; private int scale; @@ -1873,35 +1285,12 @@ public static class DecimalFromTimestampTreeReader extends ConvertTreeReader { this.scale = readerType.getScale(); timestampTreeReader = new TimestampTreeReader(columnId, skipCorrupt); setConvertTreeReader(timestampTreeReader); - timestampResult = new TimestampWritable(); - } - - @Override - Object next(Object previous) throws IOException { - - TimestampWritable readTimestampResult = - (TimestampWritable) timestampTreeReader.next(timestampResult); - - HiveDecimalWritable result = null; - if (readTimestampResult != null) { - double doubleValue = readTimestampResult.getDouble(); - HiveDecimal value = HiveDecimal.create(Double.toString(doubleValue)); - if (value != null) { - if (previous == null) { - result = new HiveDecimalWritable(); - } else { - result = (HiveDecimalWritable) previous; - } - result.set(value, precision, scale); - } - } - return result; } @Override public void setConvertVectorElement(int elementNum) throws IOException { - timestampResult.set(timestampColVector.asScratchTimestamp(elementNum)); - double doubleValue = timestampResult.getDouble(); + double doubleValue = TimestampUtils.getDouble( + timestampColVector.asScratchTimestamp(elementNum)); HiveDecimal value = HiveDecimal.create(Double.toString(doubleValue)); if (value != null) { decimalColVector.set(elementNum, value); @@ -1946,16 +1335,6 @@ public static class StringGroupFromAnyIntegerTreeReader extends ConvertTreeReade setConvertTreeReader(anyIntegerAsLongTreeReader); } - @Override - Object next(Object previous) throws IOException { - Writable result = null; - if (anyIntegerAsLongTreeReader.read()) { - result = getStringGroupResultFromString( - previous, readerType, anyIntegerAsLongTreeReader.getString()); - } - return result; - } - @Override public void setConvertVectorElement(int elementNum) { long longValue = longColVector.vector[elementNum]; @@ -1999,23 +1378,6 @@ public static class StringGroupFromFloatTreeReader extends ConvertTreeReader { floatResult = new FloatWritable(); } - @Override - Object next(Object previous) throws IOException { - - FloatWritable readFloatResult = - (FloatWritable) floatTreeReader.next(floatResult); - - Writable result = null; - if (readFloatResult != null) { - float floatValue = readFloatResult.get(); - if (!Float.isNaN(floatValue)) { - result = getStringGroupResultFromString( - previous, readerType, String.valueOf(floatValue)); - } - } - return result; - } - @Override public void setConvertVectorElement(int elementNum) { float floatValue = (float) doubleColVector.vector[elementNum]; @@ -2050,7 +1412,6 @@ public static class StringGroupFromDoubleTreeReader extends ConvertTreeReader { private DoubleTreeReader doubleTreeReader; private final TypeDescription readerType; - private DoubleWritable doubleResult; private DoubleColumnVector doubleColVector; private BytesColumnVector bytesColVector; @@ -2060,24 +1421,6 @@ public static class StringGroupFromDoubleTreeReader extends ConvertTreeReader { this.readerType = readerType; doubleTreeReader = new DoubleTreeReader(columnId); setConvertTreeReader(doubleTreeReader); - doubleResult = new DoubleWritable(); - } - - @Override - Object next(Object previous) throws IOException { - - DoubleWritable readDoubleResult = - (DoubleWritable) doubleTreeReader.next(doubleResult); - - Writable result = null; - if (readDoubleResult != null) { - double doubleValue = readDoubleResult.get(); - if (!Double.isNaN(doubleValue)) { - result = getStringGroupResultFromString( - 
previous, readerType, String.valueOf(doubleValue)); - } - } - return result; } @Override @@ -2118,7 +1461,6 @@ public static class StringGroupFromDecimalTreeReader extends ConvertTreeReader { private int precision; private int scale; private final TypeDescription readerType; - private HiveDecimalWritable hiveDecimalResult; private DecimalColumnVector decimalColVector; private BytesColumnVector bytesColVector; @@ -2130,21 +1472,6 @@ public static class StringGroupFromDecimalTreeReader extends ConvertTreeReader { this.readerType = readerType; decimalTreeReader = new DecimalTreeReader(columnId, precision, scale); setConvertTreeReader(decimalTreeReader); - hiveDecimalResult = new HiveDecimalWritable(); - } - - @Override - Object next(Object previous) throws IOException { - - HiveDecimalWritable readHiveDecimalResult = - (HiveDecimalWritable) decimalTreeReader.next(hiveDecimalResult); - - Writable result = null; - if (readHiveDecimalResult != null) { - result = getStringGroupResultFromString( - previous, readerType, readHiveDecimalResult.getHiveDecimal().toString()); - } - return result; } @Override @@ -2175,7 +1502,6 @@ public static class StringGroupFromTimestampTreeReader extends ConvertTreeReader private TimestampTreeReader timestampTreeReader; private final TypeDescription readerType; - private TimestampWritable timestampWritableResult; private TimestampColumnVector timestampColVector; private BytesColumnVector bytesColVector; @@ -2185,22 +1511,6 @@ public static class StringGroupFromTimestampTreeReader extends ConvertTreeReader this.readerType = readerType; timestampTreeReader = new TimestampTreeReader(columnId, skipCorrupt); setConvertTreeReader(timestampTreeReader); - timestampWritableResult = new TimestampWritable(); - } - - @Override - Object next(Object previous) throws IOException { - - TimestampWritable readTimestampWritableResult = - (TimestampWritable) timestampTreeReader.next(timestampWritableResult); - - Writable result = null; - if (readTimestampWritableResult != null) { - result = getStringGroupResultFromString( - previous, readerType, readTimestampWritableResult.toString()); - } - - return result; } @Override @@ -2247,21 +1557,6 @@ public static class StringGroupFromDateTreeReader extends ConvertTreeReader { date = new Date(0); } - @Override - Object next(Object previous) throws IOException { - - DateWritable readDateWritableResult = - (DateWritable) dateTreeReader.next(dateWritableResult); - - Writable result = null; - if (readDateWritableResult != null) { - result = getStringGroupResultFromString( - previous, readerType, readDateWritableResult.toString()); - } - - return result; - } - @Override public void setConvertVectorElement(int elementNum) throws IOException { date.setTime(DateWritable.daysToMillis((int) longColVector.vector[elementNum])); @@ -2292,7 +1587,6 @@ public static class StringGroupFromStringGroupTreeReader extends ConvertTreeRead private final TypeDescription fileType; private final TypeDescription readerType; - private Writable writable; StringGroupFromStringGroupTreeReader(int columnId, TypeDescription fileType, TypeDescription readerType) throws IOException { @@ -2301,21 +1595,6 @@ public static class StringGroupFromStringGroupTreeReader extends ConvertTreeRead this.readerType = readerType; stringGroupTreeReader = getStringGroupTreeReader(columnId, fileType); setConvertTreeReader(stringGroupTreeReader); - writable = getStringGroupWritable(fileType); - } - - @Override - Object next(Object previous) throws IOException { - - String stringValue = 
stringFromStringGroupTreeReader( - stringGroupTreeReader, writable, fileType); - - Writable result = null; - if (stringValue != null) { - result = getStringGroupResultFromString( - previous, readerType, stringValue); - } - return result; } @Override @@ -2368,28 +1647,24 @@ public static class StringGroupFromBinaryTreeReader extends ConvertTreeReader { binaryWritableResult = new BytesWritable(); } - @Override - Object next(Object previous) throws IOException { - - BytesWritable readBytesWritableResult = - (BytesWritable) binaryTreeReader.next(binaryWritableResult); - - Writable result = null; - if (readBytesWritableResult != null) { - result = getStringGroupResultFromString( - previous, readerType, readBytesWritableResult.toString()); - } - - return result; - } - @Override public void setConvertVectorElement(int elementNum) throws IOException { - // UNDONE: Binary to StringGroup conversion? byte[] bytes = inBytesColVector.vector[elementNum]; int start = inBytesColVector.start[elementNum]; int length = inBytesColVector.length[elementNum]; - assignStringGroupVectorEntry(outBytesColVector, elementNum, readerType, bytes, start, length); + byte[] string = new byte[length == 0 ? 0 : 3 * length - 1]; + for(int p = 0; p < string.length; p += 2) { + if (p != 0) { + string[p++] = ' '; + } + int num = 0xff & bytes[start++]; + int digit = num / 16; + string[p] = (byte)((digit) + (digit < 10 ? '0' : 'a' - 10)); + digit = num % 16; + string[p + 1] = (byte)((digit) + (digit < 10 ? '0' : 'a' - 10)); + } + assignStringGroupVectorEntry(outBytesColVector, elementNum, readerType, + string, 0, string.length); } @Override @@ -2423,27 +1698,11 @@ public static class TimestampFromAnyIntegerTreeReader extends ConvertTreeReader setConvertTreeReader(anyIntegerAsLongTreeReader); } - @Override - Object next(Object previous) throws IOException { - TimestampWritable result = null; - if (anyIntegerAsLongTreeReader.read()) { - long longValue = anyIntegerAsLongTreeReader.getLong(); - if (previous == null) { - result = new TimestampWritable(); - } else { - result = (TimestampWritable) previous; - } - // UNDONE: What does the boolean setting need to be? - result.set(TimestampWritable.longToTimestamp(longValue, false)); - } - return result; - } - @Override public void setConvertVectorElement(int elementNum) { long longValue = longColVector.vector[elementNum]; // UNDONE: What does the boolean setting need to be? 
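The rewritten binary-to-string conversion above now renders each byte as two lowercase hex digits, space separated, instead of leaving the conversion undone. A standalone sketch of the same formatting (class name and sample bytes are made up for illustration):

    public class HexFormatSketch {
      /** Mirrors the conversion above: two lowercase hex digits per byte, separated by spaces. */
      static String toHex(byte[] bytes, int start, int length) {
        StringBuilder sb = new StringBuilder(length == 0 ? 0 : 3 * length - 1);
        for (int i = 0; i < length; ++i) {
          if (i != 0) {
            sb.append(' ');
          }
          int num = 0xff & bytes[start + i];
          sb.append(Character.forDigit(num >>> 4, 16));  // high nibble, lowercase
          sb.append(Character.forDigit(num & 0xf, 16));  // low nibble, lowercase
        }
        return sb.toString();
      }

      public static void main(String[] args) {
        byte[] sample = {(byte) 0xDE, (byte) 0xAD, (byte) 0xBE, (byte) 0xEF};
        System.out.println(toHex(sample, 0, sample.length));   // prints "de ad be ef"
      }
    }
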
- timestampColVector.set(elementNum, TimestampWritable.longToTimestamp(longValue, false)); + timestampColVector.set(elementNum, new Timestamp(longValue)); } @Override @@ -2478,30 +1737,11 @@ public static class TimestampFromFloatTreeReader extends ConvertTreeReader { floatResult = new FloatWritable(); } - @Override - Object next(Object previous) throws IOException { - - FloatWritable readFloatResult = - (FloatWritable) floatTreeReader.next(floatResult); - - TimestampWritable result = null; - if (readFloatResult != null) { - float floatValue = readFloatResult.get(); - if (previous == null) { - result = new TimestampWritable(); - } else { - result = (TimestampWritable) previous; - } - result.set(TimestampWritable.doubleToTimestamp(floatValue)); - } - return result; - } - @Override public void setConvertVectorElement(int elementNum) { float floatValue = (float) doubleColVector.vector[elementNum]; timestampColVector.set(elementNum, - TimestampWritable.doubleToTimestamp(floatValue)); + TimestampUtils.doubleToTimestamp(floatValue)); } @Override @@ -2524,7 +1764,6 @@ public static class TimestampFromDoubleTreeReader extends ConvertTreeReader { private DoubleTreeReader doubleTreeReader; - private DoubleWritable doubleResult; private DoubleColumnVector doubleColVector; private TimestampColumnVector timestampColVector; @@ -2533,33 +1772,13 @@ public static class TimestampFromDoubleTreeReader extends ConvertTreeReader { super(columnId); doubleTreeReader = new DoubleTreeReader(columnId); setConvertTreeReader(doubleTreeReader); - doubleResult = new DoubleWritable(); - } - - @Override - Object next(Object previous) throws IOException { - - DoubleWritable readDoubleResult = - (DoubleWritable) doubleTreeReader.next(doubleResult); - - TimestampWritable result = null; - if (readDoubleResult != null) { - double doubleValue = readDoubleResult.get(); - if (previous == null) { - result = new TimestampWritable(); - } else { - result = (TimestampWritable) previous; - } - result.set(TimestampWritable.doubleToTimestamp(doubleValue)); - } - return result; } @Override public void setConvertVectorElement(int elementNum) { double doubleValue = doubleColVector.vector[elementNum]; timestampColVector.set(elementNum, - TimestampWritable.doubleToTimestamp(doubleValue)); + TimestampUtils.doubleToTimestamp(doubleValue)); } @Override @@ -2598,31 +1817,10 @@ public static class TimestampFromDecimalTreeReader extends ConvertTreeReader { hiveDecimalResult = new HiveDecimalWritable(); } - @Override - Object next(Object previous) throws IOException { - - HiveDecimalWritable readHiveDecimalResult = - (HiveDecimalWritable) decimalTreeReader.next(hiveDecimalResult); - - TimestampWritable result = null; - if (readHiveDecimalResult != null) { - Timestamp timestampValue = - TimestampWritable.decimalToTimestamp( - readHiveDecimalResult.getHiveDecimal()); - if (previous == null) { - result = new TimestampWritable(); - } else { - result = (TimestampWritable) previous; - } - result.set(timestampValue); - } - return result; - } - @Override public void setConvertVectorElement(int elementNum) { Timestamp timestampValue = - TimestampWritable.decimalToTimestamp( + TimestampUtils.decimalToTimestamp( decimalColVector.vector[elementNum].getHiveDecimal()); timestampColVector.set(elementNum, timestampValue); } @@ -2648,7 +1846,6 @@ public static class TimestampFromStringGroupTreeReader extends ConvertTreeReader private TreeReader stringGroupTreeReader; private final TypeDescription fileType; - private Writable writable; private BytesColumnVector 
bytesColVector; private TimestampColumnVector timestampColVector; @@ -2658,28 +1855,6 @@ public static class TimestampFromStringGroupTreeReader extends ConvertTreeReader this.fileType = fileType; stringGroupTreeReader = getStringGroupTreeReader(columnId, fileType); setConvertTreeReader(stringGroupTreeReader); - writable = getStringGroupWritable(fileType); - } - - @Override - Object next(Object previous) throws IOException { - - String stringValue = stringFromStringGroupTreeReader( - stringGroupTreeReader, writable, fileType); - - TimestampWritable result = null; - if (stringValue != null) { - Timestamp timestampValue = parseTimestampFromString(stringValue); - if (timestampValue != null) { - if (previous == null) { - result = new TimestampWritable(); - } else { - result = (TimestampWritable) previous; - } - result.set(timestampValue); - } - } - return result; } @Override @@ -2727,25 +1902,6 @@ public static class TimestampFromDateTreeReader extends ConvertTreeReader { doubleResult = new DateWritable(); } - @Override - Object next(Object previous) throws IOException { - - DateWritable readDateResult = - (DateWritable) dateTreeReader.next(doubleResult); - - TimestampWritable result = null; - if (readDateResult != null) { - Timestamp timestamp = new Timestamp(readDateResult.get().getTime()); - if (previous == null) { - result = new TimestampWritable(); - } else { - result = (TimestampWritable) previous; - } - result.set(timestamp); - } - return result; - } - @Override public void setConvertVectorElement(int elementNum) { long millis = @@ -2774,7 +1930,6 @@ public static class DateFromStringGroupTreeReader extends ConvertTreeReader { private TreeReader stringGroupTreeReader; private final TypeDescription fileType; - private Writable writable; private BytesColumnVector bytesColVector; private LongColumnVector longColVector; @@ -2784,28 +1939,6 @@ public static class DateFromStringGroupTreeReader extends ConvertTreeReader { this.fileType = fileType; stringGroupTreeReader = getStringGroupTreeReader(columnId, fileType); setConvertTreeReader(stringGroupTreeReader); - writable = getStringGroupWritable(fileType); - } - - @Override - Object next(Object previous) throws IOException { - - String stringValue = stringFromStringGroupTreeReader( - stringGroupTreeReader, writable, fileType); - - DateWritable result = null; - if (stringValue != null) { - Date dateValue = parseDateFromString(stringValue); - if (dateValue != null) { - if (previous == null) { - result = new DateWritable(); - } else { - result = (DateWritable) previous; - } - result.set(dateValue); - } - } - return result; } @Override @@ -2842,7 +1975,6 @@ public static class DateFromTimestampTreeReader extends ConvertTreeReader { private TimestampTreeReader timestampTreeReader; private final TypeDescription readerType; - private TimestampWritable timestampResult; private TimestampColumnVector timestampColVector; private LongColumnVector longColVector; @@ -2852,34 +1984,13 @@ public static class DateFromTimestampTreeReader extends ConvertTreeReader { this.readerType = readerType; timestampTreeReader = new TimestampTreeReader(columnId, skipCorrupt); setConvertTreeReader(timestampTreeReader); - timestampResult = new TimestampWritable(); - } - - @Override - Object next(Object previous) throws IOException { - - TimestampWritable readTimestampResult = - (TimestampWritable) timestampTreeReader.next(timestampResult); - - DateWritable result = null; - if (readTimestampResult != null) { - Date dateValue = - 
DateWritable.timeToDate(readTimestampResult.getSeconds()); - if (previous == null) { - result = new DateWritable(); - } else { - result = (DateWritable) previous; - } - result.set(dateValue); - } - return result; } @Override public void setConvertVectorElement(int elementNum) throws IOException { - timestampResult.set(timestampColVector.asScratchTimestamp(elementNum)); Date dateValue = - DateWritable.timeToDate(timestampResult.getSeconds()); + DateWritable.timeToDate(TimestampUtils.millisToSeconds( + timestampColVector.asScratchTimestamp(elementNum).getTime())); longColVector.vector[elementNum] = DateWritable.dateToDays(dateValue); } @@ -2904,7 +2015,6 @@ public static class BinaryFromStringGroupTreeReader extends ConvertTreeReader { private TreeReader stringGroupTreeReader; private final TypeDescription fileType; - private Writable writable; BinaryFromStringGroupTreeReader(int columnId, TypeDescription fileType) throws IOException { @@ -2912,26 +2022,6 @@ public static class BinaryFromStringGroupTreeReader extends ConvertTreeReader { this.fileType = fileType; stringGroupTreeReader = getStringGroupTreeReader(columnId, fileType); setConvertTreeReader(stringGroupTreeReader); - writable = getStringGroupWritable(fileType); - } - - @Override - Object next(Object previous) throws IOException { - - String stringValue = stringFromStringGroupTreeReader( - stringGroupTreeReader, writable, fileType); - - BytesWritable result = null; - if (stringValue != null) { - byte[] bytes = stringValue.getBytes(); - if (previous == null) { - result = new BytesWritable(); - } else { - result = (BytesWritable) previous; - } - result.set(bytes, 0, bytes.length); - } - return result; } @Override @@ -3571,7 +2661,7 @@ private static TreeReader createBinaryConvertTreeReader(int columnId, * Input must be data type UNION * Convert value for tag * - * @param columnId + * @param readerType * @param evolution * @param included * @param skipCorrupt @@ -3747,4 +2837,4 @@ public static boolean canConvert(TypeDescription fileType, TypeDescription reade fileType.getCategory()); } } -} \ No newline at end of file +} diff --git a/orc/src/java/org/apache/orc/impl/HadoopShims.java b/orc/src/java/org/apache/orc/impl/HadoopShims.java index 2980d7103988..ef7d70fb4005 100644 --- a/orc/src/java/org/apache/orc/impl/HadoopShims.java +++ b/orc/src/java/org/apache/orc/impl/HadoopShims.java @@ -18,9 +18,13 @@ package org.apache.orc.impl; +import org.apache.hadoop.fs.FSDataInputStream; +import org.apache.hadoop.io.Text; import org.apache.hadoop.util.VersionInfo; +import java.io.Closeable; import java.io.IOException; +import java.io.InputStream; import java.nio.ByteBuffer; public interface HadoopShims { @@ -43,6 +47,81 @@ interface DirectDecompressor { */ DirectDecompressor getDirectDecompressor(DirectCompressionType codec); + /** + * a hadoop.io ByteBufferPool shim. + */ + public interface ByteBufferPoolShim { + /** + * Get a new ByteBuffer from the pool. The pool can provide this from + * removing a buffer from its internal cache, or by allocating a + * new buffer. + * + * @param direct Whether the buffer should be direct. + * @param length The minimum length the buffer will have. + * @return A new ByteBuffer. Its capacity can be less + * than what was requested, but must be at + * least 1 byte. + */ + ByteBuffer getBuffer(boolean direct, int length); + + /** + * Release a buffer back to the pool. + * The pool may choose to put this buffer into its cache/free it. 
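A minimal sketch of a pool that satisfies this getBuffer/putBuffer contract, assuming no caching at all (the class name is made up); a real pool would recycle returned buffers instead of dropping them:

    import java.nio.ByteBuffer;
    import org.apache.orc.impl.HadoopShims;

    /** Illustrative only: always allocates, never caches. */
    class AllocatingBufferPool implements HadoopShims.ByteBufferPoolShim {
      @Override
      public ByteBuffer getBuffer(boolean direct, int length) {
        // Hand back a fresh buffer of at least the requested length.
        return direct ? ByteBuffer.allocateDirect(length) : ByteBuffer.allocate(length);
      }

      @Override
      public void putBuffer(ByteBuffer buffer) {
        // No cache: drop the reference and let it be garbage collected.
      }
    }
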
+ * + * @param buffer a direct bytebuffer + */ + void putBuffer(ByteBuffer buffer); + } + + /** + * Provides an HDFS ZeroCopyReader shim. + * @param in FSDataInputStream to read from (where the cached/mmap buffers are tied to) + * @param in ByteBufferPoolShim to allocate fallback buffers with + * + * @return returns null if not supported + */ + public ZeroCopyReaderShim getZeroCopyReader(FSDataInputStream in, ByteBufferPoolShim pool) throws IOException; + + public interface ZeroCopyReaderShim extends Closeable { + /** + * Get a ByteBuffer from the FSDataInputStream - this can be either a HeapByteBuffer or an MappedByteBuffer. + * Also move the in stream by that amount. The data read can be small than maxLength. + * + * @return ByteBuffer read from the stream, + */ + public ByteBuffer readBuffer(int maxLength, boolean verifyChecksums) throws IOException; + /** + * Release a ByteBuffer obtained from a read on the + * Also move the in stream by that amount. The data read can be small than maxLength. + * + */ + public void releaseBuffer(ByteBuffer buffer); + + /** + * Close the underlying stream. + * @throws IOException + */ + public void close() throws IOException; + } + /** + * Read data into a Text object in the fastest way possible + */ + public interface TextReaderShim { + /** + * @param txt + * @param size + * @return bytes read + * @throws IOException + */ + void read(Text txt, int size) throws IOException; + } + + /** + * Wrap a TextReaderShim around an input stream. The reader shim will not + * buffer any reads from the underlying stream and will only consume bytes + * which are required for TextReaderShim.read() input. + */ + public TextReaderShim getTextReaderShim(InputStream input) throws IOException; class Factory { private static HadoopShims SHIMS = null; diff --git a/orc/src/java/org/apache/orc/impl/HadoopShimsCurrent.java b/orc/src/java/org/apache/orc/impl/HadoopShimsCurrent.java index 3b9371d5816b..5c53f744c7b9 100644 --- a/orc/src/java/org/apache/orc/impl/HadoopShimsCurrent.java +++ b/orc/src/java/org/apache/orc/impl/HadoopShimsCurrent.java @@ -18,10 +18,14 @@ package org.apache.orc.impl; +import org.apache.hadoop.fs.FSDataInputStream; +import org.apache.hadoop.io.Text; import org.apache.hadoop.io.compress.snappy.SnappyDecompressor; import org.apache.hadoop.io.compress.zlib.ZlibDecompressor; +import java.io.DataInputStream; import java.io.IOException; +import java.io.InputStream; import java.nio.ByteBuffer; /** @@ -59,4 +63,30 @@ public DirectDecompressor getDirectDecompressor( return null; } } + + @Override + public ZeroCopyReaderShim getZeroCopyReader(FSDataInputStream in, + ByteBufferPoolShim pool + ) throws IOException { + return ZeroCopyShims.getZeroCopyReader(in, pool); + } + + private final class FastTextReaderShim implements TextReaderShim { + private final DataInputStream din; + + public FastTextReaderShim(InputStream in) { + this.din = new DataInputStream(in); + } + + @Override + public void read(Text txt, int len) throws IOException { + txt.readWithKnownLength(din, len); + } + } + + @Override + public TextReaderShim getTextReaderShim(InputStream in) throws IOException { + return new FastTextReaderShim(in); + } + } diff --git a/orc/src/java/org/apache/orc/impl/HadoopShims_2_2.java b/orc/src/java/org/apache/orc/impl/HadoopShims_2_2.java index ac4683601055..3f65e7447861 100644 --- a/orc/src/java/org/apache/orc/impl/HadoopShims_2_2.java +++ b/orc/src/java/org/apache/orc/impl/HadoopShims_2_2.java @@ -18,19 +18,84 @@ package org.apache.orc.impl; -import 
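A usage sketch of the read/release protocol the zero-copy interfaces above imply; how the HadoopShims instance and the pool are obtained is left out, and the end-of-stream handling of readBuffer is not specified here, so only a single chunk is read:

    import java.io.IOException;
    import java.nio.ByteBuffer;
    import org.apache.hadoop.fs.FSDataInputStream;
    import org.apache.orc.impl.HadoopShims;

    class ZeroCopyReadSketch {
      /** Reads one chunk with zero-copy when supported, always releasing the buffer. */
      static int readOneChunk(HadoopShims shims, FSDataInputStream in,
                              HadoopShims.ByteBufferPoolShim pool) throws IOException {
        HadoopShims.ZeroCopyReaderShim reader = shims.getZeroCopyReader(in, pool);
        if (reader == null) {
          return -1;                          // zero-copy not supported; caller falls back to normal reads
        }
        try {
          ByteBuffer chunk = reader.readBuffer(256 * 1024, false); // may return fewer bytes than asked
          int n = chunk.remaining();
          reader.releaseBuffer(chunk);        // always hand buffers back to the reader
          return n;
        } finally {
          reader.close();                     // per the contract, closes the underlying stream
        }
      }
    }
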
org.apache.hadoop.io.compress.snappy.SnappyDecompressor; -import org.apache.hadoop.io.compress.zlib.ZlibDecompressor; +import org.apache.hadoop.fs.FSDataInputStream; +import org.apache.hadoop.io.Text; +import java.io.EOFException; import java.io.IOException; -import java.nio.ByteBuffer; +import java.io.InputStream; +import java.lang.reflect.Method; /** * Shims for versions of Hadoop up to and including 2.2.x */ public class HadoopShims_2_2 implements HadoopShims { + final boolean zeroCopy; + final boolean fastRead; + + HadoopShims_2_2() { + boolean zcr = false; + try { + Class.forName("org.apache.hadoop.fs.CacheFlag", false, + HadoopShims_2_2.class.getClassLoader()); + zcr = true; + } catch (ClassNotFoundException ce) { + } + zeroCopy = zcr; + boolean fastRead = false; + if (zcr) { + for (Method m : Text.class.getMethods()) { + if ("readWithKnownLength".equals(m.getName())) { + fastRead = true; + } + } + } + this.fastRead = fastRead; + } + public DirectDecompressor getDirectDecompressor( DirectCompressionType codec) { return null; } + + @Override + public ZeroCopyReaderShim getZeroCopyReader(FSDataInputStream in, + ByteBufferPoolShim pool + ) throws IOException { + if(zeroCopy) { + return ZeroCopyShims.getZeroCopyReader(in, pool); + } + /* not supported */ + return null; + } + + private final class BasicTextReaderShim implements TextReaderShim { + private final InputStream in; + + public BasicTextReaderShim(InputStream in) { + this.in = in; + } + + @Override + public void read(Text txt, int len) throws IOException { + int offset = 0; + byte[] bytes = new byte[len]; + while (len > 0) { + int written = in.read(bytes, offset, len); + if (written < 0) { + throw new EOFException("Can't finish read from " + in + " read " + + (offset) + " bytes out of " + bytes.length); + } + len -= written; + offset += written; + } + txt.set(bytes); + } + } + + @Override + public TextReaderShim getTextReaderShim(InputStream in) throws IOException { + return new BasicTextReaderShim(in); + } } diff --git a/orc/src/java/org/apache/orc/impl/IntegerReader.java b/orc/src/java/org/apache/orc/impl/IntegerReader.java index 8bef0f150eb7..3e64d548c753 100644 --- a/orc/src/java/org/apache/orc/impl/IntegerReader.java +++ b/orc/src/java/org/apache/orc/impl/IntegerReader.java @@ -78,4 +78,5 @@ void nextVector(ColumnVector column, void nextVector(ColumnVector column, int[] data, int length - ) throws IOException;} + ) throws IOException; +} diff --git a/orc/src/java/org/apache/orc/impl/OrcAcidUtils.java b/orc/src/java/org/apache/orc/impl/OrcAcidUtils.java new file mode 100644 index 000000000000..72c7f547e993 --- /dev/null +++ b/orc/src/java/org/apache/orc/impl/OrcAcidUtils.java @@ -0,0 +1,85 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + *

+ * http://www.apache.org/licenses/LICENSE-2.0 + *

+ * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.orc.impl; + +import org.apache.hadoop.fs.FSDataInputStream; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.Path; +import org.apache.orc.Reader; + +import java.io.IOException; +import java.nio.ByteBuffer; +import java.nio.charset.CharacterCodingException; +import java.nio.charset.Charset; +import java.nio.charset.CharsetDecoder; + +public class OrcAcidUtils { + public static final String ACID_STATS = "hive.acid.stats"; + public static final String DELTA_SIDE_FILE_SUFFIX = "_flush_length"; + + /** + * Get the filename of the ORC ACID side file that contains the lengths + * of the intermediate footers. + * @param main the main ORC filename + * @return the name of the side file + */ + public static Path getSideFile(Path main) { + return new Path(main + DELTA_SIDE_FILE_SUFFIX); + } + + /** + * Read the side file to get the last flush length. + * @param fs the file system to use + * @param deltaFile the path of the delta file + * @return the maximum size of the file to use + * @throws IOException + */ + public static long getLastFlushLength(FileSystem fs, + Path deltaFile) throws IOException { + Path lengths = getSideFile(deltaFile); + long result = Long.MAX_VALUE; + try (FSDataInputStream stream = fs.open(lengths)) { + result = -1; + while (stream.available() > 0) { + result = stream.readLong(); + } + return result; + } catch (IOException ioe) { + return result; + } + } + + private static final Charset utf8 = Charset.forName("UTF-8"); + private static final CharsetDecoder utf8Decoder = utf8.newDecoder(); + + public static AcidStats parseAcidStats(Reader reader) { + if (reader.hasMetadataValue(ACID_STATS)) { + try { + ByteBuffer val = reader.getMetadataValue(ACID_STATS).duplicate(); + return new AcidStats(utf8Decoder.decode(val).toString()); + } catch (CharacterCodingException e) { + throw new IllegalArgumentException("Bad string encoding for " + + ACID_STATS, e); + } + } else { + return null; + } + } + +} diff --git a/orc/src/java/org/apache/orc/impl/ReaderImpl.java b/orc/src/java/org/apache/orc/impl/ReaderImpl.java new file mode 100644 index 000000000000..2da590e917bd --- /dev/null +++ b/orc/src/java/org/apache/orc/impl/ReaderImpl.java @@ -0,0 +1,758 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
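A small usage sketch for the OrcAcidUtils helpers introduced above (the delta path is made up): the side file is the delta path with the "_flush_length" suffix appended, and getLastFlushLength returns Long.MAX_VALUE when the side file cannot be opened, otherwise the last long recorded in it.

    import org.apache.hadoop.conf.Configuration;
    import org.apache.hadoop.fs.FileSystem;
    import org.apache.hadoop.fs.Path;
    import org.apache.orc.impl.OrcAcidUtils;

    class SideFileSketch {
      public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        FileSystem fs = FileSystem.getLocal(conf);
        // Hypothetical ACID delta bucket file.
        Path delta = new Path("/tmp/warehouse/t/delta_0000001_0000001/bucket_00000");
        Path side = OrcAcidUtils.getSideFile(delta);          // ".../bucket_00000_flush_length"
        long usable = OrcAcidUtils.getLastFlushLength(fs, delta);
        System.out.println(side + " -> usable length " + usable);
      }
    }
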
+ */ + +package org.apache.orc.impl; + +import java.io.IOException; +import java.nio.ByteBuffer; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.Collections; +import java.util.HashSet; +import java.util.List; +import java.util.Set; + +import org.apache.orc.OrcFile; +import org.apache.orc.OrcUtils; +import org.apache.orc.Reader; +import org.apache.orc.RecordReader; +import org.apache.orc.TypeDescription; +import org.apache.orc.ColumnStatistics; +import org.apache.orc.CompressionCodec; +import org.apache.orc.FileFormatException; +import org.apache.orc.FileMetaInfo; +import org.apache.orc.FileMetadata; +import org.apache.orc.StripeInformation; +import org.apache.orc.StripeStatistics; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.FSDataInputStream; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.hive.common.io.DiskRange; +import org.apache.hadoop.hive.ql.util.JavaDataModel; +import org.apache.hadoop.io.Text; +import org.apache.orc.OrcProto; + +import com.google.common.collect.Lists; +import com.google.protobuf.CodedInputStream; + +public class ReaderImpl implements Reader { + + private static final Logger LOG = LoggerFactory.getLogger(ReaderImpl.class); + + private static final int DIRECTORY_SIZE_GUESS = 16 * 1024; + + protected final FileSystem fileSystem; + private final long maxLength; + protected final Path path; + protected final org.apache.orc.CompressionKind compressionKind; + protected final CompressionCodec codec; + protected final int bufferSize; + private final List stripeStats; + private final int metadataSize; + protected final List types; + private final TypeDescription schema; + private final List userMetadata; + private final List fileStats; + private final List stripes; + protected final int rowIndexStride; + private final long contentLength, numberOfRows; + + + private long deserializedSize = -1; + protected final Configuration conf; + private final List versionList; + private final OrcFile.WriterVersion writerVersion; + + // Same for metastore cache - maintains the same background buffer, but includes postscript. + // This will only be set if the file footer/metadata was read from disk. 
+ private final ByteBuffer footerMetaAndPsBuffer; + + public static class StripeInformationImpl + implements StripeInformation { + private final OrcProto.StripeInformation stripe; + + public StripeInformationImpl(OrcProto.StripeInformation stripe) { + this.stripe = stripe; + } + + @Override + public long getOffset() { + return stripe.getOffset(); + } + + @Override + public long getLength() { + return stripe.getDataLength() + getIndexLength() + getFooterLength(); + } + + @Override + public long getDataLength() { + return stripe.getDataLength(); + } + + @Override + public long getFooterLength() { + return stripe.getFooterLength(); + } + + @Override + public long getIndexLength() { + return stripe.getIndexLength(); + } + + @Override + public long getNumberOfRows() { + return stripe.getNumberOfRows(); + } + + @Override + public String toString() { + return "offset: " + getOffset() + " data: " + getDataLength() + + " rows: " + getNumberOfRows() + " tail: " + getFooterLength() + + " index: " + getIndexLength(); + } + } + + @Override + public long getNumberOfRows() { + return numberOfRows; + } + + @Override + public List getMetadataKeys() { + List result = new ArrayList(); + for(OrcProto.UserMetadataItem item: userMetadata) { + result.add(item.getName()); + } + return result; + } + + @Override + public ByteBuffer getMetadataValue(String key) { + for(OrcProto.UserMetadataItem item: userMetadata) { + if (item.hasName() && item.getName().equals(key)) { + return item.getValue().asReadOnlyByteBuffer(); + } + } + throw new IllegalArgumentException("Can't find user metadata " + key); + } + + public boolean hasMetadataValue(String key) { + for(OrcProto.UserMetadataItem item: userMetadata) { + if (item.hasName() && item.getName().equals(key)) { + return true; + } + } + return false; + } + + @Override + public org.apache.orc.CompressionKind getCompressionKind() { + return compressionKind; + } + + @Override + public int getCompressionSize() { + return bufferSize; + } + + @Override + public List getStripes() { + return stripes; + } + + @Override + public long getContentLength() { + return contentLength; + } + + @Override + public List getTypes() { + return types; + } + + @Override + public OrcFile.Version getFileVersion() { + for (OrcFile.Version version: OrcFile.Version.values()) { + if ((versionList != null && !versionList.isEmpty()) && + version.getMajor() == versionList.get(0) && + version.getMinor() == versionList.get(1)) { + return version; + } + } + return OrcFile.Version.V_0_11; + } + + @Override + public OrcFile.WriterVersion getWriterVersion() { + return writerVersion; + } + + @Override + public int getRowIndexStride() { + return rowIndexStride; + } + + @Override + public ColumnStatistics[] getStatistics() { + ColumnStatistics[] result = new ColumnStatistics[types.size()]; + for(int i=0; i < result.length; ++i) { + result[i] = ColumnStatisticsImpl.deserialize(fileStats.get(i)); + } + return result; + } + + @Override + public TypeDescription getSchema() { + return schema; + } + + /** + * Ensure this is an ORC file to prevent users from trying to read text + * files or RC files as ORC files. 
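For orientation, the tail of an ORC file is laid out as [metadata][footer][postscript][one-byte postscript length]; a sketch of the offset arithmetic the footer-extraction code below relies on, with made-up sample lengths:

    class OrcTailLayoutSketch {
      /** Prints where each tail section starts, given the lengths recorded in the PostScript. */
      static void printTailLayout(long fileSize, int psLen, long footerLen, long metadataLen) {
        long psStart = fileSize - 1 - psLen;               // PostScript ends at the length byte
        long footerStart = psStart - footerLen;            // Footer immediately precedes it
        long metadataStart = footerStart - metadataLen;    // Metadata immediately precedes the Footer
        System.out.println("metadata@" + metadataStart + " footer@" + footerStart
            + " postscript@" + psStart + " psLen byte@" + (fileSize - 1));
      }

      public static void main(String[] args) {
        printTailLayout(1000000L, 23, 400L, 120L);          // sample values only
      }
    }
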
+ * @param in the file being read + * @param path the filename for error messages + * @param psLen the postscript length + * @param buffer the tail of the file + * @throws IOException + */ + protected static void ensureOrcFooter(FSDataInputStream in, + Path path, + int psLen, + ByteBuffer buffer) throws IOException { + int magicLength = OrcFile.MAGIC.length(); + int fullLength = magicLength + 1; + if (psLen < fullLength || buffer.remaining() < fullLength) { + throw new FileFormatException("Malformed ORC file " + path + + ". Invalid postscript length " + psLen); + } + int offset = buffer.arrayOffset() + buffer.position() + buffer.limit() - fullLength; + byte[] array = buffer.array(); + // now look for the magic string at the end of the postscript. + if (!Text.decode(array, offset, magicLength).equals(OrcFile.MAGIC)) { + // If it isn't there, this may be the 0.11.0 version of ORC. + // Read the first 3 bytes of the file to check for the header + byte[] header = new byte[magicLength]; + in.readFully(0, header, 0, magicLength); + // if it isn't there, this isn't an ORC file + if (!Text.decode(header, 0 , magicLength).equals(OrcFile.MAGIC)) { + throw new FileFormatException("Malformed ORC file " + path + + ". Invalid postscript."); + } + } + } + + /** + * Build a version string out of an array. + * @param version the version number as a list + * @return the human readable form of the version string + */ + private static String versionString(List version) { + StringBuilder buffer = new StringBuilder(); + for(int i=0; i < version.size(); ++i) { + if (i != 0) { + buffer.append('.'); + } + buffer.append(version.get(i)); + } + return buffer.toString(); + } + + /** + * Check to see if this ORC file is from a future version and if so, + * warn the user that we may not be able to read all of the column encodings. + * @param log the logger to write any error message to + * @param path the data source path for error messages + * @param version the version of hive that wrote the file. + */ + protected static void checkOrcVersion(Logger log, Path path, + List version) { + if (version.size() >= 1) { + int major = version.get(0); + int minor = 0; + if (version.size() >= 2) { + minor = version.get(1); + } + if (major > OrcFile.Version.CURRENT.getMajor() || + (major == OrcFile.Version.CURRENT.getMajor() && + minor > OrcFile.Version.CURRENT.getMinor())) { + log.warn(path + " was written by a future Hive version " + + versionString(version) + + ". This file may not be readable by this version of Hive."); + } + } + } + + /** + * Constructor that let's the user specify additional options. 
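A minimal sketch of opening a file through the public entry point that ends up in this constructor; OrcFile.createReader and OrcFile.readerOptions are assumed to be the factory methods in the orc module, and the path is made up:

    import org.apache.hadoop.conf.Configuration;
    import org.apache.hadoop.fs.Path;
    import org.apache.orc.OrcFile;
    import org.apache.orc.Reader;

    class OpenReaderSketch {
      public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        Reader reader = OrcFile.createReader(new Path("/tmp/example.orc"),
            OrcFile.readerOptions(conf));
        System.out.println(reader);                             // ORC Reader(/tmp/example.orc)
        System.out.println("rows   = " + reader.getNumberOfRows());
        System.out.println("schema = " + reader.getSchema());
        System.out.println("codec  = " + reader.getCompressionKind());
      }
    }
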
+ * @param path pathname for file + * @param options options for reading + * @throws IOException + */ + public ReaderImpl(Path path, OrcFile.ReaderOptions options) throws IOException { + FileSystem fs = options.getFilesystem(); + if (fs == null) { + fs = path.getFileSystem(options.getConfiguration()); + } + this.fileSystem = fs; + this.path = path; + this.conf = options.getConfiguration(); + this.maxLength = options.getMaxLength(); + + FileMetadata fileMetadata = options.getFileMetadata(); + if (fileMetadata != null) { + this.compressionKind = fileMetadata.getCompressionKind(); + this.bufferSize = fileMetadata.getCompressionBufferSize(); + this.codec = WriterImpl.createCodec(compressionKind); + this.metadataSize = fileMetadata.getMetadataSize(); + this.stripeStats = fileMetadata.getStripeStats(); + this.versionList = fileMetadata.getVersionList(); + this.writerVersion = + OrcFile.WriterVersion.from(fileMetadata.getWriterVersionNum()); + this.types = fileMetadata.getTypes(); + this.rowIndexStride = fileMetadata.getRowIndexStride(); + this.contentLength = fileMetadata.getContentLength(); + this.numberOfRows = fileMetadata.getNumberOfRows(); + this.fileStats = fileMetadata.getFileStats(); + this.stripes = fileMetadata.getStripes(); + this.userMetadata = null; // not cached and not needed here + this.footerMetaAndPsBuffer = null; + } else { + FileMetaInfo footerMetaData; + if (options.getFileMetaInfo() != null) { + footerMetaData = options.getFileMetaInfo(); + this.footerMetaAndPsBuffer = null; + } else { + footerMetaData = extractMetaInfoFromFooter(fs, path, + options.getMaxLength()); + this.footerMetaAndPsBuffer = footerMetaData.footerMetaAndPsBuffer; + } + MetaInfoObjExtractor rInfo = + new MetaInfoObjExtractor(footerMetaData.compressionType, + footerMetaData.bufferSize, + footerMetaData.metadataSize, + footerMetaData.footerBuffer + ); + this.compressionKind = rInfo.compressionKind; + this.codec = rInfo.codec; + this.bufferSize = rInfo.bufferSize; + this.metadataSize = rInfo.metadataSize; + this.stripeStats = rInfo.metadata.getStripeStatsList(); + this.types = rInfo.footer.getTypesList(); + this.rowIndexStride = rInfo.footer.getRowIndexStride(); + this.contentLength = rInfo.footer.getContentLength(); + this.numberOfRows = rInfo.footer.getNumberOfRows(); + this.userMetadata = rInfo.footer.getMetadataList(); + this.fileStats = rInfo.footer.getStatisticsList(); + this.versionList = footerMetaData.versionList; + this.writerVersion = footerMetaData.writerVersion; + this.stripes = convertProtoStripesToStripes(rInfo.footer.getStripesList()); + } + this.schema = OrcUtils.convertTypeFromProtobuf(this.types, 0); + } + + /** + * Get the WriterVersion based on the ORC file postscript. 
+ * @param writerVersion the integer writer version + * @return the version of the software that produced the file + */ + public static OrcFile.WriterVersion getWriterVersion(int writerVersion) { + for(OrcFile.WriterVersion version: OrcFile.WriterVersion.values()) { + if (version.getId() == writerVersion) { + return version; + } + } + return OrcFile.WriterVersion.FUTURE; + } + + private static OrcProto.Footer extractFooter(ByteBuffer bb, int footerAbsPos, + int footerSize, CompressionCodec codec, int bufferSize) throws IOException { + bb.position(footerAbsPos); + bb.limit(footerAbsPos + footerSize); + return OrcProto.Footer.parseFrom(InStream.createCodedInputStream("footer", + Lists.newArrayList(new BufferChunk(bb, 0)), footerSize, codec, bufferSize)); + } + + private static OrcProto.Metadata extractMetadata(ByteBuffer bb, int metadataAbsPos, + int metadataSize, CompressionCodec codec, int bufferSize) throws IOException { + bb.position(metadataAbsPos); + bb.limit(metadataAbsPos + metadataSize); + return OrcProto.Metadata.parseFrom(InStream.createCodedInputStream("metadata", + Lists.newArrayList(new BufferChunk(bb, 0)), metadataSize, codec, bufferSize)); + } + + private static OrcProto.PostScript extractPostScript(ByteBuffer bb, Path path, + int psLen, int psAbsOffset) throws IOException { + // TODO: when PB is upgraded to 2.6, newInstance(ByteBuffer) method should be used here. + assert bb.hasArray(); + CodedInputStream in = CodedInputStream.newInstance( + bb.array(), bb.arrayOffset() + psAbsOffset, psLen); + OrcProto.PostScript ps = OrcProto.PostScript.parseFrom(in); + checkOrcVersion(LOG, path, ps.getVersionList()); + + // Check compression codec. + switch (ps.getCompression()) { + case NONE: + break; + case ZLIB: + break; + case SNAPPY: + break; + case LZO: + break; + default: + throw new IllegalArgumentException("Unknown compression"); + } + return ps; + } + + private static FileMetaInfo extractMetaInfoFromFooter(FileSystem fs, + Path path, + long maxFileLength + ) throws IOException { + FSDataInputStream file = fs.open(path); + ByteBuffer buffer = null, fullFooterBuffer = null; + OrcProto.PostScript ps = null; + OrcFile.WriterVersion writerVersion = null; + try { + // figure out the size of the file using the option or filesystem + long size; + if (maxFileLength == Long.MAX_VALUE) { + size = fs.getFileStatus(path).getLen(); + } else { + size = maxFileLength; + } + + //read last bytes into buffer to get PostScript + int readSize = (int) Math.min(size, DIRECTORY_SIZE_GUESS); + buffer = ByteBuffer.allocate(readSize); + assert buffer.position() == 0; + file.readFully((size - readSize), + buffer.array(), buffer.arrayOffset(), readSize); + buffer.position(0); + + //read the PostScript + //get length of PostScript + int psLen = buffer.get(readSize - 1) & 0xff; + ensureOrcFooter(file, path, psLen, buffer); + int psOffset = readSize - 1 - psLen; + ps = extractPostScript(buffer, path, psLen, psOffset); + + int footerSize = (int) ps.getFooterLength(); + int metadataSize = (int) ps.getMetadataLength(); + writerVersion = extractWriterVersion(ps); + + //check if extra bytes need to be read + int extra = Math.max(0, psLen + 1 + footerSize + metadataSize - readSize); + if (extra > 0) { + //more bytes need to be read, seek back to the right place and read extra bytes + ByteBuffer extraBuf = ByteBuffer.allocate(extra + readSize); + file.readFully((size - readSize - extra), extraBuf.array(), + extraBuf.arrayOffset() + extraBuf.position(), extra); + extraBuf.position(extra); + //append with already read 
bytes + extraBuf.put(buffer); + buffer = extraBuf; + buffer.position(0); + fullFooterBuffer = buffer.slice(); + buffer.limit(footerSize + metadataSize); + } else { + //footer is already in the bytes in buffer, just adjust position, length + buffer.position(psOffset - footerSize - metadataSize); + fullFooterBuffer = buffer.slice(); + buffer.limit(psOffset); + } + + // remember position for later TODO: what later? this comment is useless + buffer.mark(); + } finally { + try { + file.close(); + } catch (IOException ex) { + LOG.error("Failed to close the file after another error", ex); + } + } + + return new FileMetaInfo( + ps.getCompression().toString(), + (int) ps.getCompressionBlockSize(), + (int) ps.getMetadataLength(), + buffer, + ps.getVersionList(), + writerVersion, + fullFooterBuffer + ); + } + + protected static OrcFile.WriterVersion extractWriterVersion(OrcProto.PostScript ps) { + return (ps.hasWriterVersion() + ? getWriterVersion(ps.getWriterVersion()) : OrcFile.WriterVersion.ORIGINAL); + } + + protected static List convertProtoStripesToStripes( + List stripes) { + List result = new ArrayList(stripes.size()); + for (OrcProto.StripeInformation info : stripes) { + result.add(new StripeInformationImpl(info)); + } + return result; + } + + /** + * MetaInfoObjExtractor - has logic to create the values for the fields in ReaderImpl + * from serialized fields. + * As the fields are final, the fields need to be initialized in the constructor and + * can't be done in some helper function. So this helper class is used instead. + * + */ + private static class MetaInfoObjExtractor{ + final org.apache.orc.CompressionKind compressionKind; + final CompressionCodec codec; + final int bufferSize; + final int metadataSize; + final OrcProto.Metadata metadata; + final OrcProto.Footer footer; + + MetaInfoObjExtractor(String codecStr, int bufferSize, int metadataSize, + ByteBuffer footerBuffer) throws IOException { + + this.compressionKind = org.apache.orc.CompressionKind.valueOf(codecStr.toUpperCase()); + this.bufferSize = bufferSize; + this.codec = WriterImpl.createCodec(compressionKind); + this.metadataSize = metadataSize; + + int position = footerBuffer.position(); + int footerBufferSize = footerBuffer.limit() - footerBuffer.position() - metadataSize; + + this.metadata = extractMetadata(footerBuffer, position, metadataSize, codec, bufferSize); + this.footer = extractFooter( + footerBuffer, position + metadataSize, footerBufferSize, codec, bufferSize); + + footerBuffer.position(position); + } + } + + @Override + public ByteBuffer getSerializedFileFooter() { + return footerMetaAndPsBuffer; + } + + @Override + public RecordReader rows() throws IOException { + return rows(new Options()); + } + + @Override + public RecordReader rows(Options options) throws IOException { + LOG.info("Reading ORC rows from " + path + " with " + options); + boolean[] include = options.getInclude(); + // if included columns is null, then include all columns + if (include == null) { + include = new boolean[types.size()]; + Arrays.fill(include, true); + options.include(include); + } + return new RecordReaderImpl(this, options); + } + + + @Override + public long getRawDataSize() { + // if the deserializedSize is not computed, then compute it, else + // return the already computed size. 
since we are reading from the footer + // we don't have to compute deserialized size repeatedly + if (deserializedSize == -1) { + List indices = Lists.newArrayList(); + for (int i = 0; i < fileStats.size(); ++i) { + indices.add(i); + } + deserializedSize = getRawDataSizeFromColIndices(indices); + } + return deserializedSize; + } + + @Override + public long getRawDataSizeFromColIndices(List colIndices) { + return getRawDataSizeFromColIndices(colIndices, types, fileStats); + } + + public static long getRawDataSizeFromColIndices( + List colIndices, List types, + List stats) { + long result = 0; + for (int colIdx : colIndices) { + result += getRawDataSizeOfColumn(colIdx, types, stats); + } + return result; + } + + private static long getRawDataSizeOfColumn(int colIdx, List types, + List stats) { + OrcProto.ColumnStatistics colStat = stats.get(colIdx); + long numVals = colStat.getNumberOfValues(); + OrcProto.Type type = types.get(colIdx); + + switch (type.getKind()) { + case BINARY: + // old orc format doesn't support binary statistics. checking for binary + // statistics is not required as protocol buffers takes care of it. + return colStat.getBinaryStatistics().getSum(); + case STRING: + case CHAR: + case VARCHAR: + // old orc format doesn't support sum for string statistics. checking for + // existence is not required as protocol buffers takes care of it. + + // ORC strings are deserialized to java strings. so use java data model's + // string size + numVals = numVals == 0 ? 1 : numVals; + int avgStrLen = (int) (colStat.getStringStatistics().getSum() / numVals); + return numVals * JavaDataModel.get().lengthForStringOfLength(avgStrLen); + case TIMESTAMP: + return numVals * JavaDataModel.get().lengthOfTimestamp(); + case DATE: + return numVals * JavaDataModel.get().lengthOfDate(); + case DECIMAL: + return numVals * JavaDataModel.get().lengthOfDecimal(); + case DOUBLE: + case LONG: + return numVals * JavaDataModel.get().primitive2(); + case FLOAT: + case INT: + case SHORT: + case BOOLEAN: + case BYTE: + return numVals * JavaDataModel.get().primitive1(); + default: + LOG.debug("Unknown primitive category: " + type.getKind()); + break; + } + + return 0; + } + + @Override + public long getRawDataSizeOfColumns(List colNames) { + List colIndices = getColumnIndicesFromNames(colNames); + return getRawDataSizeFromColIndices(colIndices); + } + + private List getColumnIndicesFromNames(List colNames) { + // top level struct + OrcProto.Type type = types.get(0); + List colIndices = Lists.newArrayList(); + List fieldNames = type.getFieldNamesList(); + int fieldIdx; + for (String colName : colNames) { + if (fieldNames.contains(colName)) { + fieldIdx = fieldNames.indexOf(colName); + } else { + String s = "Cannot find field for: " + colName + " in "; + for (String fn : fieldNames) { + s += fn + ", "; + } + LOG.warn(s); + continue; + } + + // a single field may span multiple columns. 
find start and end column + // index for the requested field + int idxStart = type.getSubtypes(fieldIdx); + + int idxEnd; + + // if the specified is the last field and then end index will be last + // column index + if (fieldIdx + 1 > fieldNames.size() - 1) { + idxEnd = getLastIdx() + 1; + } else { + idxEnd = type.getSubtypes(fieldIdx + 1); + } + + // if start index and end index are same then the field is a primitive + // field else complex field (like map, list, struct, union) + if (idxStart == idxEnd) { + // simple field + colIndices.add(idxStart); + } else { + // complex fields spans multiple columns + for (int i = idxStart; i < idxEnd; i++) { + colIndices.add(i); + } + } + } + return colIndices; + } + + private int getLastIdx() { + Set indices = new HashSet<>(); + for (OrcProto.Type type : types) { + indices.addAll(type.getSubtypesList()); + } + return Collections.max(indices); + } + + @Override + public List getOrcProtoStripeStatistics() { + return stripeStats; + } + + @Override + public List getOrcProtoFileStatistics() { + return fileStats; + } + + @Override + public List getStripeStatistics() { + List result = new ArrayList<>(); + for (OrcProto.StripeStatistics ss : stripeStats) { + result.add(new StripeStatistics(ss.getColStatsList())); + } + return result; + } + + public List getOrcProtoUserMetadata() { + return userMetadata; + } + + @Override + public List getVersionList() { + return versionList; + } + + @Override + public int getMetadataSize() { + return metadataSize; + } + + @Override + public String toString() { + StringBuilder buffer = new StringBuilder(); + buffer.append("ORC Reader("); + buffer.append(path); + if (maxLength != -1) { + buffer.append(", "); + buffer.append(maxLength); + } + buffer.append(")"); + return buffer.toString(); + } +} diff --git a/orc/src/java/org/apache/orc/impl/RecordReaderImpl.java b/orc/src/java/org/apache/orc/impl/RecordReaderImpl.java new file mode 100644 index 000000000000..36a802e4dda6 --- /dev/null +++ b/orc/src/java/org/apache/orc/impl/RecordReaderImpl.java @@ -0,0 +1,1215 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.orc.impl; + +import java.io.IOException; +import java.math.BigDecimal; +import java.sql.Date; +import java.sql.Timestamp; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.HashMap; +import java.util.List; +import java.util.Map; + +import org.apache.orc.BooleanColumnStatistics; +import org.apache.orc.Reader; +import org.apache.orc.RecordReader; +import org.apache.orc.TypeDescription; +import org.apache.orc.ColumnStatistics; +import org.apache.orc.CompressionCodec; +import org.apache.orc.DataReader; +import org.apache.orc.DateColumnStatistics; +import org.apache.orc.DecimalColumnStatistics; +import org.apache.orc.DoubleColumnStatistics; +import org.apache.orc.IntegerColumnStatistics; +import org.apache.orc.OrcConf; +import org.apache.orc.StringColumnStatistics; +import org.apache.orc.StripeInformation; +import org.apache.orc.TimestampColumnStatistics; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.hive.common.io.DiskRange; +import org.apache.hadoop.hive.common.io.DiskRangeList; +import org.apache.hadoop.hive.common.io.DiskRangeList.CreateHelper; +import org.apache.hadoop.hive.common.type.HiveDecimal; +import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch; +import org.apache.orc.BloomFilterIO; +import org.apache.hadoop.hive.ql.io.sarg.PredicateLeaf; +import org.apache.hadoop.hive.ql.io.sarg.SearchArgument; +import org.apache.hadoop.hive.ql.io.sarg.SearchArgument.TruthValue; +import org.apache.hadoop.hive.serde2.io.DateWritable; +import org.apache.hadoop.hive.serde2.io.HiveDecimalWritable; +import org.apache.hadoop.hive.ql.util.TimestampUtils; +import org.apache.hadoop.io.Text; +import org.apache.orc.OrcProto; + +public class RecordReaderImpl implements RecordReader { + static final Logger LOG = LoggerFactory.getLogger(RecordReaderImpl.class); + private static final boolean isLogDebugEnabled = LOG.isDebugEnabled(); + private static final Object UNKNOWN_VALUE = new Object(); + protected final Path path; + private final long firstRow; + private final List stripes = + new ArrayList(); + private OrcProto.StripeFooter stripeFooter; + private final long totalRowCount; + private final CompressionCodec codec; + protected final TypeDescription schema; + private final List types; + private final int bufferSize; + private final boolean[] included; + private final long rowIndexStride; + private long rowInStripe = 0; + private int currentStripe = -1; + private long rowBaseInStripe = 0; + private long rowCountInStripe = 0; + private final Map streams = + new HashMap(); + DiskRangeList bufferChunks = null; + private final TreeReaderFactory.TreeReader reader; + private final OrcProto.RowIndex[] indexes; + private final OrcProto.BloomFilterIndex[] bloomFilterIndices; + private final SargApplier sargApp; + // an array about which row groups aren't skipped + private boolean[] includedRowGroups = null; + private final DataReader dataReader; + + /** + * Given a list of column names, find the given column and return the index. 
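A worked example of the mapping this method and mapSargColumnsToOrcInternalColIdx perform (column names and root offset are made up): with columnNames = {"name", "age"} and rootColumn = 1, a predicate on "age" maps to 1 + 1 = 2, and an unknown column maps to -1. A standalone restatement:

    class FindColumnSketch {
      /** Same contract as findColumns below: array index plus rootColumn, or -1 if absent. */
      static int findColumn(String[] columnNames, String columnName, int rootColumn) {
        for (int i = 0; i < columnNames.length; ++i) {
          if (columnName.equals(columnNames[i])) {
            return i + rootColumn;
          }
        }
        return -1;
      }

      public static void main(String[] args) {
        String[] names = {"name", "age"};
        System.out.println(findColumn(names, "age", 1));      // 2
        System.out.println(findColumn(names, "missing", 1));  // -1
      }
    }
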
+ * + * @param columnNames the list of potential column names + * @param columnName the column name to look for + * @param rootColumn offset the result with the rootColumn + * @return the column number or -1 if the column wasn't found + */ + static int findColumns(String[] columnNames, + String columnName, + int rootColumn) { + for(int i=0; i < columnNames.length; ++i) { + if (columnName.equals(columnNames[i])) { + return i + rootColumn; + } + } + return -1; + } + + /** + * Find the mapping from predicate leaves to columns. + * @param sargLeaves the search argument that we need to map + * @param columnNames the names of the columns + * @param rootColumn the offset of the top level row, which offsets the + * result + * @return an array mapping the sarg leaves to concrete column numbers + */ + public static int[] mapSargColumnsToOrcInternalColIdx(List sargLeaves, + String[] columnNames, + int rootColumn) { + int[] result = new int[sargLeaves.size()]; + Arrays.fill(result, -1); + for(int i=0; i < result.length; ++i) { + String colName = sargLeaves.get(i).getColumnName(); + result[i] = findColumns(columnNames, colName, rootColumn); + } + return result; + } + + protected RecordReaderImpl(ReaderImpl fileReader, + Reader.Options options) throws IOException { + SchemaEvolution treeReaderSchema; + this.included = options.getInclude(); + included[0] = true; + if (options.getSchema() == null) { + if (LOG.isInfoEnabled()) { + LOG.info("Schema on read not provided -- using file schema " + + fileReader.getSchema()); + } + treeReaderSchema = new SchemaEvolution(fileReader.getSchema(), included); + } else { + + // Now that we are creating a record reader for a file, validate that the schema to read + // is compatible with the file schema. + // + treeReaderSchema = new SchemaEvolution(fileReader.getSchema(), + options.getSchema(),included); + } + this.schema = treeReaderSchema.getReaderSchema(); + this.path = fileReader.path; + this.codec = fileReader.codec; + this.types = fileReader.types; + this.bufferSize = fileReader.bufferSize; + this.rowIndexStride = fileReader.rowIndexStride; + SearchArgument sarg = options.getSearchArgument(); + if (sarg != null && rowIndexStride != 0) { + sargApp = new SargApplier( + sarg, options.getColumnNames(), rowIndexStride, types, + included.length); + } else { + sargApp = null; + } + long rows = 0; + long skippedRows = 0; + long offset = options.getOffset(); + long maxOffset = options.getMaxOffset(); + for(StripeInformation stripe: fileReader.getStripes()) { + long stripeStart = stripe.getOffset(); + if (offset > stripeStart) { + skippedRows += stripe.getNumberOfRows(); + } else if (stripeStart < maxOffset) { + this.stripes.add(stripe); + rows += stripe.getNumberOfRows(); + } + } + + Boolean zeroCopy = options.getUseZeroCopy(); + if (zeroCopy == null) { + zeroCopy = OrcConf.USE_ZEROCOPY.getBoolean(fileReader.conf); + } + if (options.getDataReader() != null) { + this.dataReader = options.getDataReader(); + } else { + this.dataReader = RecordReaderUtils.createDefaultDataReader( + DataReaderProperties.builder() + .withBufferSize(bufferSize) + .withCompression(fileReader.compressionKind) + .withFileSystem(fileReader.fileSystem) + .withPath(fileReader.path) + .withTypeCount(types.size()) + .withZeroCopy(zeroCopy) + .build()); + } + this.dataReader.open(); + + firstRow = skippedRows; + totalRowCount = rows; + Boolean skipCorrupt = options.getSkipCorruptRecords(); + if (skipCorrupt == null) { + skipCorrupt = OrcConf.SKIP_CORRUPT_DATA.getBoolean(fileReader.conf); + } + + reader = 
TreeReaderFactory.createTreeReader(treeReaderSchema.getReaderSchema(), + treeReaderSchema, included, skipCorrupt); + indexes = new OrcProto.RowIndex[types.size()]; + bloomFilterIndices = new OrcProto.BloomFilterIndex[types.size()]; + advanceToNextRow(reader, 0L, true); + } + + public static final class PositionProviderImpl implements PositionProvider { + private final OrcProto.RowIndexEntry entry; + private int index; + + public PositionProviderImpl(OrcProto.RowIndexEntry entry) { + this(entry, 0); + } + + public PositionProviderImpl(OrcProto.RowIndexEntry entry, int startPos) { + this.entry = entry; + this.index = startPos; + } + + @Override + public long getNext() { + return entry.getPositions(index++); + } + } + + public OrcProto.StripeFooter readStripeFooter(StripeInformation stripe + ) throws IOException { + return dataReader.readStripeFooter(stripe); + } + + enum Location { + BEFORE, MIN, MIDDLE, MAX, AFTER + } + + /** + * Given a point and min and max, determine if the point is before, at the + * min, in the middle, at the max, or after the range. + * @param point the point to test + * @param min the minimum point + * @param max the maximum point + * @param the type of the comparision + * @return the location of the point + */ + static Location compareToRange(Comparable point, T min, T max) { + int minCompare = point.compareTo(min); + if (minCompare < 0) { + return Location.BEFORE; + } else if (minCompare == 0) { + return Location.MIN; + } + int maxCompare = point.compareTo(max); + if (maxCompare > 0) { + return Location.AFTER; + } else if (maxCompare == 0) { + return Location.MAX; + } + return Location.MIDDLE; + } + + /** + * Get the maximum value out of an index entry. + * @param index + * the index entry + * @return the object for the maximum value or null if there isn't one + */ + static Object getMax(ColumnStatistics index) { + if (index instanceof IntegerColumnStatistics) { + return ((IntegerColumnStatistics) index).getMaximum(); + } else if (index instanceof DoubleColumnStatistics) { + return ((DoubleColumnStatistics) index).getMaximum(); + } else if (index instanceof StringColumnStatistics) { + return ((StringColumnStatistics) index).getMaximum(); + } else if (index instanceof DateColumnStatistics) { + return ((DateColumnStatistics) index).getMaximum(); + } else if (index instanceof DecimalColumnStatistics) { + return ((DecimalColumnStatistics) index).getMaximum(); + } else if (index instanceof TimestampColumnStatistics) { + return ((TimestampColumnStatistics) index).getMaximum(); + } else if (index instanceof BooleanColumnStatistics) { + if (((BooleanColumnStatistics)index).getTrueCount()!=0) { + return Boolean.TRUE; + } else { + return Boolean.FALSE; + } + } else { + return null; + } + } + + /** + * Get the minimum value out of an index entry. 
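Worked examples of the range comparison above, which drives all of the min/max predicate evaluation that follows (sample values are made up): against the range [10, 20], 5 is BEFORE, 10 is MIN, 15 is MIDDLE, 20 is MAX and 25 is AFTER. A standalone restatement of the same decision ladder:

    class CompareToRangeSketch {
      enum Location { BEFORE, MIN, MIDDLE, MAX, AFTER }

      /** Same ordering logic as compareToRange above. */
      static <T extends Comparable<T>> Location locate(T point, T min, T max) {
        int minCompare = point.compareTo(min);
        if (minCompare < 0) {
          return Location.BEFORE;
        } else if (minCompare == 0) {
          return Location.MIN;
        }
        int maxCompare = point.compareTo(max);
        if (maxCompare > 0) {
          return Location.AFTER;
        } else if (maxCompare == 0) {
          return Location.MAX;
        }
        return Location.MIDDLE;
      }

      public static void main(String[] args) {
        System.out.println(locate(5, 10, 20));   // BEFORE
        System.out.println(locate(15, 10, 20));  // MIDDLE
        System.out.println(locate(20, 10, 20));  // MAX
      }
    }
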
+ * @param index + * the index entry + * @return the object for the minimum value or null if there isn't one + */ + static Object getMin(ColumnStatistics index) { + if (index instanceof IntegerColumnStatistics) { + return ((IntegerColumnStatistics) index).getMinimum(); + } else if (index instanceof DoubleColumnStatistics) { + return ((DoubleColumnStatistics) index).getMinimum(); + } else if (index instanceof StringColumnStatistics) { + return ((StringColumnStatistics) index).getMinimum(); + } else if (index instanceof DateColumnStatistics) { + return ((DateColumnStatistics) index).getMinimum(); + } else if (index instanceof DecimalColumnStatistics) { + return ((DecimalColumnStatistics) index).getMinimum(); + } else if (index instanceof TimestampColumnStatistics) { + return ((TimestampColumnStatistics) index).getMinimum(); + } else if (index instanceof BooleanColumnStatistics) { + if (((BooleanColumnStatistics)index).getFalseCount()!=0) { + return Boolean.FALSE; + } else { + return Boolean.TRUE; + } + } else { + return UNKNOWN_VALUE; // null is not safe here + } + } + + /** + * Evaluate a predicate with respect to the statistics from the column + * that is referenced in the predicate. + * @param statsProto the statistics for the column mentioned in the predicate + * @param predicate the leaf predicate we need to evaluation + * @param bloomFilter + * @return the set of truth values that may be returned for the given + * predicate. + */ + static TruthValue evaluatePredicateProto(OrcProto.ColumnStatistics statsProto, + PredicateLeaf predicate, OrcProto.BloomFilter bloomFilter) { + ColumnStatistics cs = ColumnStatisticsImpl.deserialize(statsProto); + Object minValue = getMin(cs); + Object maxValue = getMax(cs); + BloomFilterIO bf = null; + if (bloomFilter != null) { + bf = new BloomFilterIO(bloomFilter); + } + return evaluatePredicateRange(predicate, minValue, maxValue, cs.hasNull(), bf); + } + + /** + * Evaluate a predicate with respect to the statistics from the column + * that is referenced in the predicate. + * @param stats the statistics for the column mentioned in the predicate + * @param predicate the leaf predicate we need to evaluation + * @return the set of truth values that may be returned for the given + * predicate. + */ + public static TruthValue evaluatePredicate(ColumnStatistics stats, + PredicateLeaf predicate, + BloomFilterIO bloomFilter) { + Object minValue = getMin(stats); + Object maxValue = getMax(stats); + return evaluatePredicateRange(predicate, minValue, maxValue, stats.hasNull(), bloomFilter); + } + + static TruthValue evaluatePredicateRange(PredicateLeaf predicate, Object min, + Object max, boolean hasNull, BloomFilterIO bloomFilter) { + // if we didn't have any values, everything must have been null + if (min == null) { + if (predicate.getOperator() == PredicateLeaf.Operator.IS_NULL) { + return TruthValue.YES; + } else { + return TruthValue.NULL; + } + } else if (min == UNKNOWN_VALUE) { + return TruthValue.YES_NO_NULL; + } + + TruthValue result; + Object baseObj = predicate.getLiteral(); + try { + // Predicate object and stats objects are converted to the type of the predicate object. 
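A distilled sketch of what the EQUALS branch below decides from the min/max statistics alone, assuming the literal has already been converted to the comparison type (TruthValue comes from the SearchArgument API already imported above):

    import org.apache.hadoop.hive.ql.io.sarg.SearchArgument.TruthValue;

    class EqualsOverRangeSketch {
      static <T extends Comparable<T>> TruthValue equalsOverRange(T pred, T min, T max,
                                                                  boolean hasNull) {
        if (min.equals(max) && pred.compareTo(min) == 0) {
          // Every non-null value equals the literal.
          return hasNull ? TruthValue.YES_NULL : TruthValue.YES;
        }
        if (pred.compareTo(min) < 0 || pred.compareTo(max) > 0) {
          // Literal lies outside [min, max]: no row can match.
          return hasNull ? TruthValue.NO_NULL : TruthValue.NO;
        }
        // Statistics alone cannot decide; some rows may match.
        return hasNull ? TruthValue.YES_NO_NULL : TruthValue.YES_NO;
      }

      public static void main(String[] args) {
        System.out.println(equalsOverRange(25, 10, 20, false)); // NO
        System.out.println(equalsOverRange(15, 10, 20, true));  // YES_NO_NULL
        System.out.println(equalsOverRange(10, 10, 10, false)); // YES
      }
    }
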
+ Object minValue = getBaseObjectForComparison(predicate.getType(), min); + Object maxValue = getBaseObjectForComparison(predicate.getType(), max); + Object predObj = getBaseObjectForComparison(predicate.getType(), baseObj); + + result = evaluatePredicateMinMax(predicate, predObj, minValue, maxValue, hasNull); + if (shouldEvaluateBloomFilter(predicate, result, bloomFilter)) { + result = evaluatePredicateBloomFilter(predicate, predObj, bloomFilter, hasNull); + } + // in case failed conversion, return the default YES_NO_NULL truth value + } catch (Exception e) { + if (LOG.isWarnEnabled()) { + final String statsType = min == null ? + (max == null ? "null" : max.getClass().getSimpleName()) : + min.getClass().getSimpleName(); + final String predicateType = baseObj == null ? "null" : baseObj.getClass().getSimpleName(); + final String reason = e.getClass().getSimpleName() + " when evaluating predicate." + + " Skipping ORC PPD." + + " Exception: " + e.getMessage() + + " StatsType: " + statsType + + " PredicateType: " + predicateType; + LOG.warn(reason); + LOG.debug(reason, e); + } + if (predicate.getOperator().equals(PredicateLeaf.Operator.NULL_SAFE_EQUALS) || !hasNull) { + result = TruthValue.YES_NO; + } else { + result = TruthValue.YES_NO_NULL; + } + } + return result; + } + + private static boolean shouldEvaluateBloomFilter(PredicateLeaf predicate, + TruthValue result, BloomFilterIO bloomFilter) { + // evaluate bloom filter only when + // 1) Bloom filter is available + // 2) Min/Max evaluation yield YES or MAYBE + // 3) Predicate is EQUALS or IN list + if (bloomFilter != null + && result != TruthValue.NO_NULL && result != TruthValue.NO + && (predicate.getOperator().equals(PredicateLeaf.Operator.EQUALS) + || predicate.getOperator().equals(PredicateLeaf.Operator.NULL_SAFE_EQUALS) + || predicate.getOperator().equals(PredicateLeaf.Operator.IN))) { + return true; + } + return false; + } + + private static TruthValue evaluatePredicateMinMax(PredicateLeaf predicate, Object predObj, + Object minValue, + Object maxValue, + boolean hasNull) { + Location loc; + + switch (predicate.getOperator()) { + case NULL_SAFE_EQUALS: + loc = compareToRange((Comparable) predObj, minValue, maxValue); + if (loc == Location.BEFORE || loc == Location.AFTER) { + return TruthValue.NO; + } else { + return TruthValue.YES_NO; + } + case EQUALS: + loc = compareToRange((Comparable) predObj, minValue, maxValue); + if (minValue.equals(maxValue) && loc == Location.MIN) { + return hasNull ? TruthValue.YES_NULL : TruthValue.YES; + } else if (loc == Location.BEFORE || loc == Location.AFTER) { + return hasNull ? TruthValue.NO_NULL : TruthValue.NO; + } else { + return hasNull ? TruthValue.YES_NO_NULL : TruthValue.YES_NO; + } + case LESS_THAN: + loc = compareToRange((Comparable) predObj, minValue, maxValue); + if (loc == Location.AFTER) { + return hasNull ? TruthValue.YES_NULL : TruthValue.YES; + } else if (loc == Location.BEFORE || loc == Location.MIN) { + return hasNull ? TruthValue.NO_NULL : TruthValue.NO; + } else { + return hasNull ? TruthValue.YES_NO_NULL : TruthValue.YES_NO; + } + case LESS_THAN_EQUALS: + loc = compareToRange((Comparable) predObj, minValue, maxValue); + if (loc == Location.AFTER || loc == Location.MAX) { + return hasNull ? TruthValue.YES_NULL : TruthValue.YES; + } else if (loc == Location.BEFORE) { + return hasNull ? TruthValue.NO_NULL : TruthValue.NO; + } else { + return hasNull ? 
TruthValue.YES_NO_NULL : TruthValue.YES_NO; + } + case IN: + if (minValue.equals(maxValue)) { + // for a single value, look through to see if that value is in the + // set + for (Object arg : predicate.getLiteralList()) { + predObj = getBaseObjectForComparison(predicate.getType(), arg); + loc = compareToRange((Comparable) predObj, minValue, maxValue); + if (loc == Location.MIN) { + return hasNull ? TruthValue.YES_NULL : TruthValue.YES; + } + } + return hasNull ? TruthValue.NO_NULL : TruthValue.NO; + } else { + // are all of the values outside of the range? + for (Object arg : predicate.getLiteralList()) { + predObj = getBaseObjectForComparison(predicate.getType(), arg); + loc = compareToRange((Comparable) predObj, minValue, maxValue); + if (loc == Location.MIN || loc == Location.MIDDLE || + loc == Location.MAX) { + return hasNull ? TruthValue.YES_NO_NULL : TruthValue.YES_NO; + } + } + return hasNull ? TruthValue.NO_NULL : TruthValue.NO; + } + case BETWEEN: + List args = predicate.getLiteralList(); + Object predObj1 = getBaseObjectForComparison(predicate.getType(), args.get(0)); + + loc = compareToRange((Comparable) predObj1, minValue, maxValue); + if (loc == Location.BEFORE || loc == Location.MIN) { + Object predObj2 = getBaseObjectForComparison(predicate.getType(), args.get(1)); + + Location loc2 = compareToRange((Comparable) predObj2, minValue, maxValue); + if (loc2 == Location.AFTER || loc2 == Location.MAX) { + return hasNull ? TruthValue.YES_NULL : TruthValue.YES; + } else if (loc2 == Location.BEFORE) { + return hasNull ? TruthValue.NO_NULL : TruthValue.NO; + } else { + return hasNull ? TruthValue.YES_NO_NULL : TruthValue.YES_NO; + } + } else if (loc == Location.AFTER) { + return hasNull ? TruthValue.NO_NULL : TruthValue.NO; + } else { + return hasNull ? TruthValue.YES_NO_NULL : TruthValue.YES_NO; + } + case IS_NULL: + // min = null condition above handles the all-nulls YES case + return hasNull ? TruthValue.YES_NO : TruthValue.NO; + default: + return hasNull ? TruthValue.YES_NO_NULL : TruthValue.YES_NO; + } + } + + private static TruthValue evaluatePredicateBloomFilter(PredicateLeaf predicate, + final Object predObj, BloomFilterIO bloomFilter, boolean hasNull) { + switch (predicate.getOperator()) { + case NULL_SAFE_EQUALS: + // null safe equals does not return *_NULL variant. So set hasNull to false + return checkInBloomFilter(bloomFilter, predObj, false); + case EQUALS: + return checkInBloomFilter(bloomFilter, predObj, hasNull); + case IN: + for (Object arg : predicate.getLiteralList()) { + // if atleast one value in IN list exist in bloom filter, qualify the row group/stripe + Object predObjItem = getBaseObjectForComparison(predicate.getType(), arg); + TruthValue result = checkInBloomFilter(bloomFilter, predObjItem, hasNull); + if (result == TruthValue.YES_NO_NULL || result == TruthValue.YES_NO) { + return result; + } + } + return hasNull ? TruthValue.NO_NULL : TruthValue.NO; + default: + return hasNull ? TruthValue.YES_NO_NULL : TruthValue.YES_NO; + } + } + + private static TruthValue checkInBloomFilter(BloomFilterIO bf, Object predObj, boolean hasNull) { + TruthValue result = hasNull ? 
TruthValue.NO_NULL : TruthValue.NO; + + if (predObj instanceof Long) { + if (bf.testLong(((Long) predObj).longValue())) { + result = TruthValue.YES_NO_NULL; + } + } else if (predObj instanceof Double) { + if (bf.testDouble(((Double) predObj).doubleValue())) { + result = TruthValue.YES_NO_NULL; + } + } else if (predObj instanceof String || predObj instanceof Text || + predObj instanceof HiveDecimalWritable || + predObj instanceof BigDecimal) { + if (bf.testString(predObj.toString())) { + result = TruthValue.YES_NO_NULL; + } + } else if (predObj instanceof Timestamp) { + if (bf.testLong(((Timestamp) predObj).getTime())) { + result = TruthValue.YES_NO_NULL; + } + } else if (predObj instanceof Date) { + if (bf.testLong(DateWritable.dateToDays((Date) predObj))) { + result = TruthValue.YES_NO_NULL; + } + } else { + // if the predicate object is null and if hasNull says there are no nulls then return NO + if (predObj == null && !hasNull) { + result = TruthValue.NO; + } else { + result = TruthValue.YES_NO_NULL; + } + } + + if (result == TruthValue.YES_NO_NULL && !hasNull) { + result = TruthValue.YES_NO; + } + + if (LOG.isDebugEnabled()) { + LOG.debug("Bloom filter evaluation: " + result.toString()); + } + + return result; + } + + private static Object getBaseObjectForComparison(PredicateLeaf.Type type, Object obj) { + if (obj == null) { + return null; + } + switch (type) { + case BOOLEAN: + if (obj instanceof Boolean) { + return obj; + } else { + // will only be true if the string conversion yields "true", all other values are + // considered false + return Boolean.valueOf(obj.toString()); + } + case DATE: + if (obj instanceof Date) { + return obj; + } else if (obj instanceof String) { + return Date.valueOf((String) obj); + } else if (obj instanceof Timestamp) { + return DateWritable.timeToDate(((Timestamp) obj).getTime() / 1000L); + } + // always string, but prevent the comparison to numbers (are they days/seconds/milliseconds?) + break; + case DECIMAL: + if (obj instanceof Boolean) { + return new HiveDecimalWritable(((Boolean) obj).booleanValue() ? 
+ HiveDecimal.ONE : HiveDecimal.ZERO); + } else if (obj instanceof Integer) { + return new HiveDecimalWritable(((Integer) obj).intValue()); + } else if (obj instanceof Long) { + return new HiveDecimalWritable(((Long) obj)); + } else if (obj instanceof Float || obj instanceof Double || + obj instanceof String) { + return new HiveDecimalWritable(obj.toString()); + } else if (obj instanceof BigDecimal) { + return new HiveDecimalWritable(HiveDecimal.create((BigDecimal) obj)); + } else if (obj instanceof HiveDecimal) { + return new HiveDecimalWritable((HiveDecimal) obj); + } else if (obj instanceof HiveDecimalWritable) { + return obj; + } else if (obj instanceof Timestamp) { + return new HiveDecimalWritable(Double.toString( + TimestampUtils.getDouble((Timestamp) obj))); + } + break; + case FLOAT: + if (obj instanceof Number) { + // widening conversion + return ((Number) obj).doubleValue(); + } else if (obj instanceof HiveDecimal) { + return ((HiveDecimal) obj).doubleValue(); + } else if (obj instanceof String) { + return Double.valueOf(obj.toString()); + } else if (obj instanceof Timestamp) { + return TimestampUtils.getDouble((Timestamp) obj); + } else if (obj instanceof HiveDecimal) { + return ((HiveDecimal) obj).doubleValue(); + } else if (obj instanceof BigDecimal) { + return ((BigDecimal) obj).doubleValue(); + } + break; + case LONG: + if (obj instanceof Number) { + // widening conversion + return ((Number) obj).longValue(); + } else if (obj instanceof HiveDecimal) { + return ((HiveDecimal) obj).longValue(); + } else if (obj instanceof String) { + return Long.valueOf(obj.toString()); + } + break; + case STRING: + if (obj != null) { + return (obj.toString()); + } + break; + case TIMESTAMP: + if (obj instanceof Timestamp) { + return obj; + } else if (obj instanceof Integer) { + return new Timestamp(((Number) obj).longValue()); + } else if (obj instanceof Float) { + return TimestampUtils.doubleToTimestamp(((Float) obj).doubleValue()); + } else if (obj instanceof Double) { + return TimestampUtils.doubleToTimestamp(((Double) obj).doubleValue()); + } else if (obj instanceof HiveDecimal) { + return TimestampUtils.decimalToTimestamp((HiveDecimal) obj); + } else if (obj instanceof HiveDecimalWritable) { + return TimestampUtils.decimalToTimestamp(((HiveDecimalWritable) obj).getHiveDecimal()); + } else if (obj instanceof Date) { + return new Timestamp(((Date) obj).getTime()); + } + // float/double conversion to timestamp is interpreted as seconds whereas integer conversion + // to timestamp is interpreted as milliseconds by default. The integer to timestamp casting + // is also config driven. The filter operator changes its promotion based on config: + // "int.timestamp.conversion.in.seconds". Disable PPD for integer cases. + break; + default: + break; + } + + throw new IllegalArgumentException(String.format( + "ORC SARGS could not convert from %s to %s", obj == null ? 
"(null)" : obj.getClass() + .getSimpleName(), type)); + } + + public static class SargApplier { + public final static boolean[] READ_ALL_RGS = null; + public final static boolean[] READ_NO_RGS = new boolean[0]; + + private final SearchArgument sarg; + private final List sargLeaves; + private final int[] filterColumns; + private final long rowIndexStride; + // same as the above array, but indices are set to true + private final boolean[] sargColumns; + + public SargApplier(SearchArgument sarg, String[] columnNames, long rowIndexStride, + List types, int includedCount) { + this.sarg = sarg; + sargLeaves = sarg.getLeaves(); + filterColumns = mapSargColumnsToOrcInternalColIdx(sargLeaves, columnNames, 0); + this.rowIndexStride = rowIndexStride; + // included will not be null, row options will fill the array with trues if null + sargColumns = new boolean[includedCount]; + for (int i : filterColumns) { + // filter columns may have -1 as index which could be partition column in SARG. + if (i > 0) { + sargColumns[i] = true; + } + } + } + + /** + * Pick the row groups that we need to load from the current stripe. + * + * @return an array with a boolean for each row group or null if all of the + * row groups must be read. + * @throws IOException + */ + public boolean[] pickRowGroups(StripeInformation stripe, OrcProto.RowIndex[] indexes, + OrcProto.BloomFilterIndex[] bloomFilterIndices, boolean returnNone) throws IOException { + long rowsInStripe = stripe.getNumberOfRows(); + int groupsInStripe = (int) ((rowsInStripe + rowIndexStride - 1) / rowIndexStride); + boolean[] result = new boolean[groupsInStripe]; // TODO: avoid alloc? + TruthValue[] leafValues = new TruthValue[sargLeaves.size()]; + boolean hasSelected = false, hasSkipped = false; + for (int rowGroup = 0; rowGroup < result.length; ++rowGroup) { + for (int pred = 0; pred < leafValues.length; ++pred) { + int columnIx = filterColumns[pred]; + if (columnIx != -1) { + if (indexes[columnIx] == null) { + throw new AssertionError("Index is not populated for " + columnIx); + } + OrcProto.RowIndexEntry entry = indexes[columnIx].getEntry(rowGroup); + if (entry == null) { + throw new AssertionError("RG is not populated for " + columnIx + " rg " + rowGroup); + } + OrcProto.ColumnStatistics stats = entry.getStatistics(); + OrcProto.BloomFilter bf = null; + if (bloomFilterIndices != null && bloomFilterIndices[filterColumns[pred]] != null) { + bf = bloomFilterIndices[filterColumns[pred]].getBloomFilter(rowGroup); + } + leafValues[pred] = evaluatePredicateProto(stats, sargLeaves.get(pred), bf); + if (LOG.isTraceEnabled()) { + LOG.trace("Stats = " + stats); + LOG.trace("Setting " + sargLeaves.get(pred) + " to " + leafValues[pred]); + } + } else { + // the column is a virtual column + leafValues[pred] = TruthValue.YES_NO_NULL; + } + } + result[rowGroup] = sarg.evaluate(leafValues).isNeeded(); + hasSelected = hasSelected || result[rowGroup]; + hasSkipped = hasSkipped || (!result[rowGroup]); + if (LOG.isDebugEnabled()) { + LOG.debug("Row group " + (rowIndexStride * rowGroup) + " to " + + (rowIndexStride * (rowGroup + 1) - 1) + " is " + + (result[rowGroup] ? "" : "not ") + "included."); + } + } + + return hasSkipped ? ((hasSelected || !returnNone) ? result : READ_NO_RGS) : READ_ALL_RGS; + } + } + + /** + * Pick the row groups that we need to load from the current stripe. + * + * @return an array with a boolean for each row group or null if all of the + * row groups must be read. 
+ * @throws IOException + */ + protected boolean[] pickRowGroups() throws IOException { + // if we don't have a sarg or indexes, we read everything + if (sargApp == null) { + return null; + } + readRowIndex(currentStripe, included, sargApp.sargColumns); + return sargApp.pickRowGroups(stripes.get(currentStripe), indexes, bloomFilterIndices, false); + } + + private void clearStreams() { + // explicit close of all streams to de-ref ByteBuffers + for (InStream is : streams.values()) { + is.close(); + } + if (bufferChunks != null) { + if (dataReader.isTrackingDiskRanges()) { + for (DiskRangeList range = bufferChunks; range != null; range = range.next) { + if (!(range instanceof BufferChunk)) { + continue; + } + dataReader.releaseBuffer(((BufferChunk) range).getChunk()); + } + } + } + bufferChunks = null; + streams.clear(); + } + + /** + * Read the current stripe into memory. + * + * @throws IOException + */ + private void readStripe() throws IOException { + StripeInformation stripe = beginReadStripe(); + includedRowGroups = pickRowGroups(); + + // move forward to the first unskipped row + if (includedRowGroups != null) { + while (rowInStripe < rowCountInStripe && + !includedRowGroups[(int) (rowInStripe / rowIndexStride)]) { + rowInStripe = Math.min(rowCountInStripe, rowInStripe + rowIndexStride); + } + } + + // if we haven't skipped the whole stripe, read the data + if (rowInStripe < rowCountInStripe) { + // if we aren't projecting columns or filtering rows, just read it all + if (included == null && includedRowGroups == null) { + readAllDataStreams(stripe); + } else { + readPartialDataStreams(stripe); + } + reader.startStripe(streams, stripeFooter); + // if we skipped the first row group, move the pointers forward + if (rowInStripe != 0) { + seekToRowEntry(reader, (int) (rowInStripe / rowIndexStride)); + } + } + } + + private StripeInformation beginReadStripe() throws IOException { + StripeInformation stripe = stripes.get(currentStripe); + stripeFooter = readStripeFooter(stripe); + clearStreams(); + // setup the position in the stripe + rowCountInStripe = stripe.getNumberOfRows(); + rowInStripe = 0; + rowBaseInStripe = 0; + for (int i = 0; i < currentStripe; ++i) { + rowBaseInStripe += stripes.get(i).getNumberOfRows(); + } + // reset all of the indexes + for (int i = 0; i < indexes.length; ++i) { + indexes[i] = null; + } + return stripe; + } + + private void readAllDataStreams(StripeInformation stripe) throws IOException { + long start = stripe.getIndexLength(); + long end = start + stripe.getDataLength(); + // explicitly trigger 1 big read + DiskRangeList toRead = new DiskRangeList(start, end); + bufferChunks = dataReader.readFileData(toRead, stripe.getOffset(), false); + List streamDescriptions = stripeFooter.getStreamsList(); + createStreams(streamDescriptions, bufferChunks, null, codec, bufferSize, streams); + } + + /** + * Plan the ranges of the file that we need to read given the list of + * columns and row groups. 
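[Reviewer note] SargApplier.pickRowGroups and readStripe above cooperate: the first turns per-row-group truth values into an include mask (null meaning read everything, an empty array meaning read nothing), the second skips ahead to the first included row before any data is decoded. A compact sketch of both steps (hypothetical names, predicate evaluation stubbed out):

import java.util.Arrays;
import java.util.function.IntPredicate;

public class RowGroupPruneSketch {
  // Stand-ins for the patch's READ_ALL_RGS (null) and READ_NO_RGS sentinels.
  static final boolean[] READ_ALL = null;
  static final boolean[] READ_NONE = new boolean[0];

  // One boolean per row group, collapsed to a sentinel when nothing (or everything) is pruned.
  static boolean[] pickRowGroups(long rowsInStripe, long rowIndexStride,
                                 IntPredicate groupMayMatch) {
    int groups = (int) ((rowsInStripe + rowIndexStride - 1) / rowIndexStride);
    boolean[] result = new boolean[groups];
    boolean selected = false, skipped = false;
    for (int g = 0; g < groups; g++) {
      result[g] = groupMayMatch.test(g);
      selected |= result[g];
      skipped |= !result[g];
    }
    if (!skipped) return READ_ALL;
    if (!selected) return READ_NONE;
    return result;
  }

  // Mirrors the loop at the top of readStripe: advance the stripe-local row
  // position past any leading row groups that were pruned away.
  static long firstIncludedRow(boolean[] mask, long rowsInStripe, long rowIndexStride) {
    long row = 0;
    if (mask != null) {
      while (row < rowsInStripe && !mask[(int) (row / rowIndexStride)]) {
        row = Math.min(rowsInStripe, row + rowIndexStride);
      }
    }
    return row;
  }

  public static void main(String[] args) {
    // 25,000 rows with a 10,000-row stride -> 3 row groups; only group 2 can match.
    boolean[] mask = pickRowGroups(25_000, 10_000, g -> g == 2);
    System.out.println(Arrays.toString(mask));                  // [false, false, true]
    System.out.println(firstIncludedRow(mask, 25_000, 10_000)); // 20000
  }
}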
+ * + * @param streamList the list of streams available + * @param indexes the indexes that have been loaded + * @param includedColumns which columns are needed + * @param includedRowGroups which row groups are needed + * @param isCompressed does the file have generic compression + * @param encodings the encodings for each column + * @param types the types of the columns + * @param compressionSize the compression block size + * @return the list of disk ranges that will be loaded + */ + static DiskRangeList planReadPartialDataStreams + (List streamList, + OrcProto.RowIndex[] indexes, + boolean[] includedColumns, + boolean[] includedRowGroups, + boolean isCompressed, + List encodings, + List types, + int compressionSize, + boolean doMergeBuffers) { + long offset = 0; + // figure out which columns have a present stream + boolean[] hasNull = RecordReaderUtils.findPresentStreamsByColumn(streamList, types); + CreateHelper list = new CreateHelper(); + for (OrcProto.Stream stream : streamList) { + long length = stream.getLength(); + int column = stream.getColumn(); + OrcProto.Stream.Kind streamKind = stream.getKind(); + // since stream kind is optional, first check if it exists + if (stream.hasKind() && + (StreamName.getArea(streamKind) == StreamName.Area.DATA) && + (column < includedColumns.length && includedColumns[column])) { + // if we aren't filtering or it is a dictionary, load it. + if (includedRowGroups == null + || RecordReaderUtils.isDictionary(streamKind, encodings.get(column))) { + RecordReaderUtils.addEntireStreamToRanges(offset, length, list, doMergeBuffers); + } else { + RecordReaderUtils.addRgFilteredStreamToRanges(stream, includedRowGroups, + isCompressed, indexes[column], encodings.get(column), types.get(column), + compressionSize, hasNull[column], offset, length, list, doMergeBuffers); + } + } + offset += length; + } + return list.extract(); + } + + void createStreams(List streamDescriptions, + DiskRangeList ranges, + boolean[] includeColumn, + CompressionCodec codec, + int bufferSize, + Map streams) throws IOException { + long streamOffset = 0; + for (OrcProto.Stream streamDesc : streamDescriptions) { + int column = streamDesc.getColumn(); + if ((includeColumn != null && + (column < included.length && !includeColumn[column])) || + streamDesc.hasKind() && + (StreamName.getArea(streamDesc.getKind()) != StreamName.Area.DATA)) { + streamOffset += streamDesc.getLength(); + continue; + } + List buffers = RecordReaderUtils.getStreamBuffers( + ranges, streamOffset, streamDesc.getLength()); + StreamName name = new StreamName(column, streamDesc.getKind()); + streams.put(name, InStream.create(name.toString(), buffers, + streamDesc.getLength(), codec, bufferSize)); + streamOffset += streamDesc.getLength(); + } + } + + private void readPartialDataStreams(StripeInformation stripe) throws IOException { + List streamList = stripeFooter.getStreamsList(); + DiskRangeList toRead = planReadPartialDataStreams(streamList, + indexes, included, includedRowGroups, codec != null, + stripeFooter.getColumnsList(), types, bufferSize, true); + if (LOG.isDebugEnabled()) { + LOG.debug("chunks = " + RecordReaderUtils.stringifyDiskRanges(toRead)); + } + bufferChunks = dataReader.readFileData(toRead, stripe.getOffset(), false); + if (LOG.isDebugEnabled()) { + LOG.debug("merge = " + RecordReaderUtils.stringifyDiskRanges(bufferChunks)); + } + + createStreams(streamList, bufferChunks, included, codec, bufferSize, streams); + } + + /** + * Read the next stripe until we find a row that we don't skip. 
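[Reviewer note] planReadPartialDataStreams above derives each stream's file position purely from the running sum of the preceding stream lengths, and keeps ranges only for DATA-area streams of included columns. A stripped-down sketch of that offset walk (hypothetical record types; no row-group filtering or range merging):

import java.util.ArrayList;
import java.util.List;

public class PlanRangesSketch {
  // Minimal stand-in for a stream description: a column id and a byte length.
  record StreamInfo(int column, long length) {}
  record Range(long offset, long end) {}

  // Streams are laid out back to back in the stripe's data section, so the
  // offset of each stream is the running sum of the lengths before it.
  static List<Range> planRanges(List<StreamInfo> streams, boolean[] includedColumns) {
    List<Range> ranges = new ArrayList<>();
    long offset = 0;
    for (StreamInfo s : streams) {
      if (s.column() < includedColumns.length && includedColumns[s.column()]) {
        ranges.add(new Range(offset, offset + s.length()));
      }
      offset += s.length();
    }
    return ranges;
  }

  public static void main(String[] args) {
    List<StreamInfo> streams = List.of(
        new StreamInfo(1, 100), new StreamInfo(2, 400), new StreamInfo(3, 50));
    boolean[] included = {false, true, false, true};
    // Column 2 is skipped, but its 400 bytes still advance the offset of column 3.
    System.out.println(planRanges(streams, included)); // [Range[offset=0, end=100], Range[offset=500, end=550]]
  }
}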
+ * + * @throws IOException + */ + private void advanceStripe() throws IOException { + rowInStripe = rowCountInStripe; + while (rowInStripe >= rowCountInStripe && + currentStripe < stripes.size() - 1) { + currentStripe += 1; + readStripe(); + } + } + + /** + * Skip over rows that we aren't selecting, so that the next row is + * one that we will read. + * + * @param nextRow the row we want to go to + * @throws IOException + */ + private boolean advanceToNextRow( + TreeReaderFactory.TreeReader reader, long nextRow, boolean canAdvanceStripe) + throws IOException { + long nextRowInStripe = nextRow - rowBaseInStripe; + // check for row skipping + if (rowIndexStride != 0 && + includedRowGroups != null && + nextRowInStripe < rowCountInStripe) { + int rowGroup = (int) (nextRowInStripe / rowIndexStride); + if (!includedRowGroups[rowGroup]) { + while (rowGroup < includedRowGroups.length && !includedRowGroups[rowGroup]) { + rowGroup += 1; + } + if (rowGroup >= includedRowGroups.length) { + if (canAdvanceStripe) { + advanceStripe(); + } + return canAdvanceStripe; + } + nextRowInStripe = Math.min(rowCountInStripe, rowGroup * rowIndexStride); + } + } + if (nextRowInStripe >= rowCountInStripe) { + if (canAdvanceStripe) { + advanceStripe(); + } + return canAdvanceStripe; + } + if (nextRowInStripe != rowInStripe) { + if (rowIndexStride != 0) { + int rowGroup = (int) (nextRowInStripe / rowIndexStride); + seekToRowEntry(reader, rowGroup); + reader.skipRows(nextRowInStripe - rowGroup * rowIndexStride); + } else { + reader.skipRows(nextRowInStripe - rowInStripe); + } + rowInStripe = nextRowInStripe; + } + return true; + } + + @Override + public boolean nextBatch(VectorizedRowBatch batch) throws IOException { + try { + if (rowInStripe >= rowCountInStripe) { + currentStripe += 1; + if (currentStripe >= stripes.size()) { + batch.size = 0; + return false; + } + readStripe(); + } + + int batchSize = computeBatchSize(batch.getMaxSize()); + + rowInStripe += batchSize; + reader.setVectorColumnCount(batch.getDataColumnCount()); + reader.nextBatch(batch, batchSize); + batch.selectedInUse = false; + batch.size = batchSize; + advanceToNextRow(reader, rowInStripe + rowBaseInStripe, true); + return batch.size != 0; + } catch (IOException e) { + // Rethrow exception with file name in log message + throw new IOException("Error reading file: " + path, e); + } + } + + private int computeBatchSize(long targetBatchSize) { + final int batchSize; + // In case of PPD, batch size should be aware of row group boundaries. If only a subset of row + // groups are selected then marker position is set to the end of range (subset of row groups + // within strip). Batch size computed out of marker position makes sure that batch size is + // aware of row group boundary and will not cause overflow when reading rows + // illustration of this case is here https://issues.apache.org/jira/browse/HIVE-6287 + if (rowIndexStride != 0 && includedRowGroups != null && rowInStripe < rowCountInStripe) { + int startRowGroup = (int) (rowInStripe / rowIndexStride); + if (!includedRowGroups[startRowGroup]) { + while (startRowGroup < includedRowGroups.length && !includedRowGroups[startRowGroup]) { + startRowGroup += 1; + } + } + + int endRowGroup = startRowGroup; + while (endRowGroup < includedRowGroups.length && includedRowGroups[endRowGroup]) { + endRowGroup += 1; + } + + final long markerPosition = + (endRowGroup * rowIndexStride) < rowCountInStripe ? 
(endRowGroup * rowIndexStride) + : rowCountInStripe; + batchSize = (int) Math.min(targetBatchSize, (markerPosition - rowInStripe)); + + if (isLogDebugEnabled && batchSize < targetBatchSize) { + LOG.debug("markerPosition: " + markerPosition + " batchSize: " + batchSize); + } + } else { + batchSize = (int) Math.min(targetBatchSize, (rowCountInStripe - rowInStripe)); + } + return batchSize; + } + + @Override + public void close() throws IOException { + clearStreams(); + dataReader.close(); + } + + @Override + public long getRowNumber() { + return rowInStripe + rowBaseInStripe + firstRow; + } + + /** + * Return the fraction of rows that have been read from the selected. + * section of the file + * + * @return fraction between 0.0 and 1.0 of rows consumed + */ + @Override + public float getProgress() { + return ((float) rowBaseInStripe + rowInStripe) / totalRowCount; + } + + private int findStripe(long rowNumber) { + for (int i = 0; i < stripes.size(); i++) { + StripeInformation stripe = stripes.get(i); + if (stripe.getNumberOfRows() > rowNumber) { + return i; + } + rowNumber -= stripe.getNumberOfRows(); + } + throw new IllegalArgumentException("Seek after the end of reader range"); + } + + public OrcIndex readRowIndex(int stripeIndex, boolean[] included, + boolean[] sargColumns) throws IOException { + return readRowIndex(stripeIndex, included, null, null, sargColumns); + } + + public OrcIndex readRowIndex(int stripeIndex, boolean[] included, + OrcProto.RowIndex[] indexes, + OrcProto.BloomFilterIndex[] bloomFilterIndex, + boolean[] sargColumns) throws IOException { + StripeInformation stripe = stripes.get(stripeIndex); + OrcProto.StripeFooter stripeFooter = null; + // if this is the current stripe, use the cached objects. + if (stripeIndex == currentStripe) { + stripeFooter = this.stripeFooter; + indexes = indexes == null ? this.indexes : indexes; + bloomFilterIndex = bloomFilterIndex == null ? this.bloomFilterIndices : bloomFilterIndex; + sargColumns = sargColumns == null ? + (sargApp == null ? null : sargApp.sargColumns) : sargColumns; + } + return dataReader.readRowIndex(stripe, stripeFooter, included, indexes, sargColumns, + bloomFilterIndex); + } + + private void seekToRowEntry(TreeReaderFactory.TreeReader reader, int rowEntry) + throws IOException { + PositionProvider[] index = new PositionProvider[indexes.length]; + for (int i = 0; i < indexes.length; ++i) { + if (indexes[i] != null) { + index[i] = new PositionProviderImpl(indexes[i].getEntry(rowEntry)); + } + } + reader.seek(index); + } + + @Override + public void seekToRow(long rowNumber) throws IOException { + if (rowNumber < 0) { + throw new IllegalArgumentException("Seek to a negative row number " + + rowNumber); + } else if (rowNumber < firstRow) { + throw new IllegalArgumentException("Seek before reader range " + + rowNumber); + } + // convert to our internal form (rows from the beginning of slice) + rowNumber -= firstRow; + + // move to the right stripe + int rightStripe = findStripe(rowNumber); + if (rightStripe != currentStripe) { + currentStripe = rightStripe; + readStripe(); + } + readRowIndex(currentStripe, included, sargApp == null ? null : sargApp.sargColumns); + + // if we aren't to the right row yet, advance in the stripe. + advanceToNextRow(reader, rowNumber, true); + } + + private static final String TRANSLATED_SARG_SEPARATOR = "_"; + public static String encodeTranslatedSargColumn(int rootColumn, Integer indexInSourceTable) { + return rootColumn + TRANSLATED_SARG_SEPARATOR + + ((indexInSourceTable == null) ? 
-1 : indexInSourceTable); + } + + public static int[] mapTranslatedSargColumns( + List types, List sargLeaves) { + int[] result = new int[sargLeaves.size()]; + OrcProto.Type lastRoot = null; // Root will be the same for everyone as of now. + String lastRootStr = null; + for (int i = 0; i < result.length; ++i) { + String[] rootAndIndex = sargLeaves.get(i).getColumnName().split(TRANSLATED_SARG_SEPARATOR); + assert rootAndIndex.length == 2; + String rootStr = rootAndIndex[0], indexStr = rootAndIndex[1]; + int index = Integer.parseInt(indexStr); + // First, check if the column even maps to anything. + if (index == -1) { + result[i] = -1; + continue; + } + assert index >= 0; + // Then, find the root type if needed. + if (!rootStr.equals(lastRootStr)) { + lastRoot = types.get(Integer.parseInt(rootStr)); + lastRootStr = rootStr; + } + // Subtypes of the root types correspond, in order, to the columns in the table schema + // (disregarding schema evolution that doesn't presently work). Get the index for the + // corresponding subtype. + result[i] = lastRoot.getSubtypes(index); + } + return result; + } +} diff --git a/ql/src/java/org/apache/hadoop/hive/ql/io/orc/RecordReaderUtils.java b/orc/src/java/org/apache/orc/impl/RecordReaderUtils.java similarity index 95% rename from ql/src/java/org/apache/hadoop/hive/ql/io/orc/RecordReaderUtils.java rename to orc/src/java/org/apache/orc/impl/RecordReaderUtils.java index 419258807382..1067957c7708 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/io/orc/RecordReaderUtils.java +++ b/orc/src/java/org/apache/orc/impl/RecordReaderUtils.java @@ -15,7 +15,7 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package org.apache.hadoop.hive.ql.io.orc; +package org.apache.orc.impl; import java.io.IOException; import java.nio.ByteBuffer; @@ -33,33 +33,23 @@ import org.apache.hadoop.hive.common.io.DiskRangeList; import org.apache.hadoop.hive.common.io.DiskRangeList.CreateHelper; import org.apache.hadoop.hive.common.io.DiskRangeList.MutateHelper; -import org.apache.hadoop.hive.shims.HadoopShims; -import org.apache.hadoop.hive.shims.ShimLoader; -import org.apache.hadoop.hive.shims.HadoopShims.ByteBufferPoolShim; -import org.apache.hadoop.hive.shims.HadoopShims.ZeroCopyReaderShim; -import org.apache.orc.StripeInformation; -import org.apache.orc.impl.BufferChunk; import org.apache.orc.CompressionCodec; import org.apache.orc.DataReader; -import org.apache.orc.impl.DataReaderProperties; -import org.apache.orc.impl.DirectDecompressionCodec; import org.apache.orc.OrcProto; import com.google.common.collect.ComparisonChain; -import org.apache.orc.impl.InStream; -import org.apache.orc.impl.OrcIndex; -import org.apache.orc.impl.OutStream; +import org.apache.orc.StripeInformation; /** * Stateless methods shared between RecordReaderImpl and EncodedReaderImpl. 
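[Reviewer note] encodeTranslatedSargColumn and mapTranslatedSargColumns above round-trip a root column id and a source-table index through a single "root_index" string, with -1 standing for an unmapped column. A minimal sketch of the encode/decode pair (simplified: no root-type subtype lookup):

public class TranslatedSargColumnSketch {
  private static final String SEP = "_";

  // Mirrors encodeTranslatedSargColumn: a missing source index is encoded as -1.
  static String encode(int rootColumn, Integer indexInSourceTable) {
    return rootColumn + SEP + (indexInSourceTable == null ? -1 : indexInSourceTable);
  }

  // Mirrors the parsing half of mapTranslatedSargColumns: [root, index].
  static int[] decode(String encoded) {
    String[] parts = encoded.split(SEP);
    return new int[] { Integer.parseInt(parts[0]), Integer.parseInt(parts[1]) };
  }

  public static void main(String[] args) {
    String col = encode(0, 3);
    System.out.println(col);                         // 0_3
    System.out.println(decode(col)[1]);              // 3
    System.out.println(decode(encode(0, null))[1]);  // -1: column not mapped, caller skips it
  }
}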
*/ public class RecordReaderUtils { - private static final HadoopShims SHIMS = ShimLoader.getHadoopShims(); + private static final HadoopShims SHIMS = HadoopShims.Factory.get(); private static class DefaultDataReader implements DataReader { private FSDataInputStream file = null; private final ByteBufferAllocatorPool pool; - private ZeroCopyReaderShim zcr = null; + private HadoopShims.ZeroCopyReaderShim zcr = null; private final FileSystem fs; private final Path path; private final boolean useZeroCopy; @@ -69,7 +59,6 @@ private static class DefaultDataReader implements DataReader { private DefaultDataReader(DefaultDataReader other) { this.pool = other.pool; - this.zcr = other.zcr; this.bufferSize = other.bufferSize; this.typeCount = other.typeCount; this.fs = other.fs; @@ -188,12 +177,15 @@ public DiskRangeList readFileData( @Override public void close() throws IOException { - if (file != null) { - file.close(); - } if (pool != null) { pool.clear(); } + // close both zcr and file + try (HadoopShims.ZeroCopyReaderShim myZcr = zcr) { + if (file != null) { + file.close(); + } + } } @Override @@ -402,7 +394,7 @@ public static String stringifyDiskRanges(DiskRangeList range) { * @throws IOException */ static DiskRangeList readDiskRanges(FSDataInputStream file, - ZeroCopyReaderShim zcr, + HadoopShims.ZeroCopyReaderShim zcr, long base, DiskRangeList range, boolean doForceDirect) throws IOException { @@ -494,19 +486,19 @@ static List getStreamBuffers(DiskRangeList range, long offset, long l return buffers; } - static ZeroCopyReaderShim createZeroCopyShim(FSDataInputStream file, + static HadoopShims.ZeroCopyReaderShim createZeroCopyShim(FSDataInputStream file, CompressionCodec codec, ByteBufferAllocatorPool pool) throws IOException { if ((codec == null || ((codec instanceof DirectDecompressionCodec) && ((DirectDecompressionCodec) codec).isAvailable()))) { /* codec is null or is available */ - return ShimLoader.getHadoopShims().getZeroCopyReader(file, pool); + return SHIMS.getZeroCopyReader(file, pool); } return null; } // this is an implementation copied from ElasticByteBufferPool in hadoop-2, // which lacks a clear()/clean() operation - public final static class ByteBufferAllocatorPool implements ByteBufferPoolShim { + public final static class ByteBufferAllocatorPool implements HadoopShims.ByteBufferPoolShim { private static final class Key implements Comparable { private final int capacity; private final long insertionGeneration; diff --git a/ql/src/java/org/apache/hadoop/hive/ql/io/orc/SchemaEvolution.java b/orc/src/java/org/apache/orc/impl/SchemaEvolution.java similarity index 99% rename from ql/src/java/org/apache/hadoop/hive/ql/io/orc/SchemaEvolution.java rename to orc/src/java/org/apache/orc/impl/SchemaEvolution.java index 046665b04783..2c80aaa8c563 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/io/orc/SchemaEvolution.java +++ b/orc/src/java/org/apache/orc/impl/SchemaEvolution.java @@ -16,7 +16,7 @@ * limitations under the License. 
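[Reviewer note] The rewritten DefaultDataReader.close() above leans on try-with-resources so the zero-copy reader is released even if closing the file throws (and a null reader is quietly ignored). A standalone sketch of that pattern with plain AutoCloseables (hypothetical names):

public class CloseBothSketch {
  // The resource declared in the header is closed even when the body throws.
  static void closeBoth(AutoCloseable zeroCopyReader, AutoCloseable file) throws Exception {
    try (AutoCloseable r = zeroCopyReader) {
      if (file != null) {
        file.close();
      }
    }
  }

  public static void main(String[] args) throws Exception {
    AutoCloseable reader = () -> System.out.println("zero-copy reader released");
    AutoCloseable file = () -> { throw new java.io.IOException("flaky close"); };
    try {
      closeBoth(reader, file);
    } catch (Exception e) {
      // "zero-copy reader released" was still printed before this exception surfaced.
      System.out.println("caught: " + e.getMessage());
    }
    closeBoth(null, () -> System.out.println("file closed")); // null reader is fine
  }
}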
*/ -package org.apache.hadoop.hive.ql.io.orc; +package org.apache.orc.impl; import java.io.IOException; import java.util.ArrayList; diff --git a/ql/src/java/org/apache/hadoop/hive/ql/io/orc/TreeReaderFactory.java b/orc/src/java/org/apache/orc/impl/TreeReaderFactory.java similarity index 84% rename from ql/src/java/org/apache/hadoop/hive/ql/io/orc/TreeReaderFactory.java rename to orc/src/java/org/apache/orc/impl/TreeReaderFactory.java index 6d1c25634b39..6c8ecfdb5137 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/io/orc/TreeReaderFactory.java +++ b/orc/src/java/org/apache/orc/impl/TreeReaderFactory.java @@ -15,18 +15,14 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package org.apache.hadoop.hive.ql.io.orc; +package org.apache.orc.impl; import java.io.EOFException; import java.io.IOException; import java.math.BigInteger; -import java.sql.Timestamp; import java.text.ParseException; import java.text.SimpleDateFormat; -import java.util.ArrayList; -import java.util.Arrays; import java.util.HashMap; -import java.util.LinkedHashMap; import java.util.List; import java.util.Map; import java.util.TimeZone; @@ -44,34 +40,8 @@ import org.apache.hadoop.hive.ql.exec.vector.UnionColumnVector; import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch; import org.apache.hadoop.hive.ql.exec.vector.expressions.StringExpr; -import org.apache.hadoop.hive.serde2.io.ByteWritable; -import org.apache.hadoop.hive.serde2.io.DateWritable; -import org.apache.hadoop.hive.serde2.io.DoubleWritable; -import org.apache.hadoop.hive.serde2.io.HiveCharWritable; -import org.apache.hadoop.hive.serde2.io.HiveDecimalWritable; -import org.apache.hadoop.hive.serde2.io.HiveVarcharWritable; -import org.apache.hadoop.hive.serde2.io.ShortWritable; -import org.apache.hadoop.hive.serde2.io.TimestampWritable; -import org.apache.hadoop.hive.shims.HadoopShims.TextReaderShim; -import org.apache.hadoop.hive.shims.ShimLoader; -import org.apache.hadoop.io.BooleanWritable; -import org.apache.hadoop.io.BytesWritable; -import org.apache.hadoop.io.FloatWritable; -import org.apache.hadoop.io.IntWritable; -import org.apache.hadoop.io.LongWritable; -import org.apache.hadoop.io.Text; import org.apache.orc.TypeDescription; -import org.apache.orc.impl.BitFieldReader; -import org.apache.orc.impl.DynamicByteArray; -import org.apache.orc.impl.InStream; -import org.apache.orc.impl.IntegerReader; import org.apache.orc.OrcProto; -import org.apache.orc.impl.PositionProvider; -import org.apache.orc.impl.RunLengthByteReader; -import org.apache.orc.impl.RunLengthIntegerReader; -import org.apache.orc.impl.RunLengthIntegerReaderV2; -import org.apache.orc.impl.SerializationUtils; -import org.apache.orc.impl.StreamName; /** * Factory for creating ORC tree readers. @@ -171,19 +141,6 @@ protected long countNonNulls(long rows) throws IOException { abstract void skipRows(long rows) throws IOException; - void readValuePresent() throws IOException { - if (present != null) { - valuePresent = present.next() == 1; - } - } - - Object next(Object previous) throws IOException { - if (present != null) { - valuePresent = present.next() == 1; - } - return previous; - } - /** * Called at the top level to read into the given batch. 
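[Reviewer note] The bulk of this TreeReaderFactory hunk deletes the row-by-row Object next(Object previous) readers, leaving only the vectorized nextVector path in the orc module. A sketch contrasting the two read styles with a toy column vector (hypothetical types, not the patch's classes):

import java.util.Arrays;

public class VectorizedReadSketch {
  // Toy stand-in for a LongColumnVector: one primitive array per batch.
  static final class LongVector {
    final long[] vector;
    LongVector(int size) { this.vector = new long[size]; }
  }

  interface LongSource {
    long next();
  }

  // Row-by-row style (the deleted next(Object) path): one call and one boxed
  // object per value.
  static Long readOne(LongSource src) {
    return src.next();
  }

  // Vectorized style (nextVector): fill a whole batch in one call, no per-row
  // allocation, amortizing call overhead across batchSize values.
  static void nextVector(LongSource src, LongVector out, int batchSize) {
    for (int i = 0; i < batchSize; i++) {
      out.vector[i] = src.next();
    }
  }

  public static void main(String[] args) {
    long[] data = {1, 2, 3, 4, 5};
    int[] pos = {0};
    LongSource src = () -> data[pos[0]++];
    LongVector batch = new LongVector(5);
    nextVector(src, batch, 5);
    System.out.println(Arrays.toString(batch.vector)); // [1, 2, 3, 4, 5]
  }
}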
* @param batch the batch to read into @@ -274,12 +231,7 @@ public void seek(PositionProvider[] position) { } @Override - Object next(Object previous) { - return null; - } - - @Override - public void nextVector(ColumnVector vector, boolean[] isNull, final int batchSize) { + public void nextVector(ColumnVector vector, boolean[] isNull, int size) { vector.noNulls = false; vector.isNull[0] = true; vector.isRepeating = true; @@ -325,21 +277,6 @@ void skipRows(long items) throws IOException { reader.skip(countNonNulls(items)); } - @Override - Object next(Object previous) throws IOException { - super.next(previous); - BooleanWritable result = null; - if (valuePresent) { - if (previous == null) { - result = new BooleanWritable(); - } else { - result = (BooleanWritable) previous; - } - result.set(reader.next() == 1); - } - return result; - } - @Override public void nextVector(ColumnVector previousVector, boolean[] isNull, @@ -386,21 +323,6 @@ public void seek(PositionProvider index) throws IOException { reader.seek(index); } - @Override - Object next(Object previous) throws IOException { - super.next(previous); - ByteWritable result = null; - if (valuePresent) { - if (previous == null) { - result = new ByteWritable(); - } else { - result = (ByteWritable) previous; - } - result.set(reader.next()); - } - return result; - } - @Override public void nextVector(ColumnVector previousVector, boolean[] isNull, @@ -468,21 +390,6 @@ public void seek(PositionProvider index) throws IOException { reader.seek(index); } - @Override - Object next(Object previous) throws IOException { - super.next(previous); - ShortWritable result = null; - if (valuePresent) { - if (previous == null) { - result = new ShortWritable(); - } else { - result = (ShortWritable) previous; - } - result.set((short) reader.next()); - } - return result; - } - @Override public void nextVector(ColumnVector previousVector, boolean[] isNull, @@ -550,21 +457,6 @@ public void seek(PositionProvider index) throws IOException { reader.seek(index); } - @Override - Object next(Object previous) throws IOException { - super.next(previous); - IntWritable result = null; - if (valuePresent) { - if (previous == null) { - result = new IntWritable(); - } else { - result = (IntWritable) previous; - } - result.set((int) reader.next()); - } - return result; - } - @Override public void nextVector(ColumnVector previousVector, boolean[] isNull, @@ -633,21 +525,6 @@ public void seek(PositionProvider index) throws IOException { reader.seek(index); } - @Override - Object next(Object previous) throws IOException { - super.next(previous); - LongWritable result = null; - if (valuePresent) { - if (previous == null) { - result = new LongWritable(); - } else { - result = (LongWritable) previous; - } - result.set(reader.next()); - } - return result; - } - @Override public void nextVector(ColumnVector previousVector, boolean[] isNull, @@ -702,21 +579,6 @@ public void seek(PositionProvider index) throws IOException { stream.seek(index); } - @Override - Object next(Object previous) throws IOException { - super.next(previous); - FloatWritable result = null; - if (valuePresent) { - if (previous == null) { - result = new FloatWritable(); - } else { - result = (FloatWritable) previous; - } - result.set(utils.readFloat(stream)); - } - return result; - } - @Override public void nextVector(ColumnVector previousVector, boolean[] isNull, @@ -811,21 +673,6 @@ public void seek(PositionProvider index) throws IOException { stream.seek(index); } - @Override - Object next(Object previous) throws 
IOException { - super.next(previous); - DoubleWritable result = null; - if (valuePresent) { - if (previous == null) { - result = new DoubleWritable(); - } else { - result = (DoubleWritable) previous; - } - result.set(utils.readDouble(stream)); - } - return result; - } - @Override public void nextVector(ColumnVector previousVector, boolean[] isNull, @@ -938,31 +785,6 @@ public void seek(PositionProvider index) throws IOException { lengths.seek(index); } - @Override - Object next(Object previous) throws IOException { - super.next(previous); - BytesWritable result = null; - if (valuePresent) { - if (previous == null) { - result = new BytesWritable(); - } else { - result = (BytesWritable) previous; - } - int len = (int) lengths.next(); - result.setSize(len); - int offset = 0; - while (len > 0) { - int written = stream.read(result.getBytes(), offset, len); - if (written < 0) { - throw new EOFException("Can't finish byte read from " + stream); - } - len -= written; - offset += written; - } - } - return result; - } - @Override public void nextVector(ColumnVector previousVector, boolean[] isNull, @@ -1086,48 +908,6 @@ public void seek(PositionProvider index) throws IOException { nanos.seek(index); } - @Override - Object next(Object previous) throws IOException { - super.next(previous); - TimestampWritable result = null; - if (valuePresent) { - if (previous == null) { - result = new TimestampWritable(); - } else { - result = (TimestampWritable) previous; - } - long millis = (data.next() + base_timestamp) * WriterImpl.MILLIS_PER_SECOND; - int newNanos = parseNanos(nanos.next()); - // fix the rounding when we divided by 1000. - if (millis >= 0) { - millis += newNanos / WriterImpl.NANOS_PER_MILLI; - } else { - millis -= newNanos / WriterImpl.NANOS_PER_MILLI; - } - long offset = 0; - // If reader and writer time zones have different rules, adjust the timezone difference - // between reader and writer taking day light savings into account. - if (!hasSameTZRules) { - offset = writerTimeZone.getOffset(millis) - readerTimeZone.getOffset(millis); - } - long adjustedMillis = millis + offset; - Timestamp ts = new Timestamp(adjustedMillis); - // Sometimes the reader timezone might have changed after adding the adjustedMillis. - // To account for that change, check for any difference in reader timezone after - // adding adjustedMillis. If so use the new offset (offset at adjustedMillis point of time). 
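[Reviewer note] The removed TimestampTreeReader.next() above adjusts the stored millis when the writer's and reader's time zones follow different rules. A minimal sketch of that offset arithmetic (names are mine; the second-pass re-check after adjustment is omitted):

import java.util.TimeZone;

public class TimestampZoneSketch {
  // When the reader's zone has different rules than the writer's, shift the
  // millis by the difference of the two zone offsets at that instant (DST-aware).
  static long adjustMillis(long millis, TimeZone writerZone, TimeZone readerZone) {
    if (writerZone.hasSameRules(readerZone)) {
      return millis;
    }
    long offset = writerZone.getOffset(millis) - readerZone.getOffset(millis);
    return millis + offset;
  }

  public static void main(String[] args) {
    TimeZone utc = TimeZone.getTimeZone("UTC");
    TimeZone la = TimeZone.getTimeZone("America/Los_Angeles");
    // 2016-01-01T00:00:00Z written in UTC, read in Los Angeles (UTC-8 in winter):
    long millis = 1451606400000L;
    System.out.println(adjustMillis(millis, utc, la) - millis); // 28800000 (+8 hours)
  }
}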
- if (!hasSameTZRules && - (readerTimeZone.getOffset(millis) != readerTimeZone.getOffset(adjustedMillis))) { - long newOffset = - writerTimeZone.getOffset(millis) - readerTimeZone.getOffset(adjustedMillis); - adjustedMillis = millis + newOffset; - ts.setTime(adjustedMillis); - } - ts.setNanos(newNanos); - result.set(ts); - } - return result; - } - @Override public void nextVector(ColumnVector previousVector, boolean[] isNull, @@ -1236,21 +1016,6 @@ public void seek(PositionProvider index) throws IOException { reader.seek(index); } - @Override - Object next(Object previous) throws IOException { - super.next(previous); - DateWritable result = null; - if (valuePresent) { - if (previous == null) { - result = new DateWritable(); - } else { - result = (DateWritable) previous; - } - result.set((int) reader.next()); - } - return result; - } - @Override public void nextVector(ColumnVector previousVector, boolean[] isNull, @@ -1328,36 +1093,18 @@ public void seek(PositionProvider index) throws IOException { scaleReader.seek(index); } - @Override - Object next(Object previous) throws IOException { - super.next(previous); - final HiveDecimalWritable result; - if (valuePresent) { - if (previous == null) { - result = new HiveDecimalWritable(); - } else { - result = (HiveDecimalWritable) previous; - } - result.set(HiveDecimal.create(SerializationUtils.readBigInteger - (valueStream), (int) scaleReader.next())); - return HiveDecimalWritable.enforcePrecisionScale(result, precision, - scale); - } - return null; - } - @Override public void nextVector(ColumnVector previousVector, boolean[] isNull, final int batchSize) throws IOException { final DecimalColumnVector result = (DecimalColumnVector) previousVector; - // Read present/isNull stream super.nextVector(result, isNull, batchSize); if (batchSize > scratchScaleVector.length) { scratchScaleVector = new int[(int) batchSize]; } + // read the scales scaleReader.nextVector(result, scratchScaleVector, batchSize); // Read value entries based on isNull entries if (result.noNulls) { @@ -1458,11 +1205,6 @@ public void seek(PositionProvider index) throws IOException { reader.seek(index); } - @Override - Object next(Object previous) throws IOException { - return reader.next(previous); - } - @Override public void nextVector(ColumnVector previousVector, boolean[] isNull, @@ -1557,8 +1299,9 @@ public static void readOrcByteArrays(InStream stream, * stripe. 
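[Reviewer note] The DecimalTreeReader changes above keep the two-stream layout: an unscaled arbitrary-precision integer plus a per-value scale (the vectorized path now pulls all the scales for a batch up front). A tiny sketch of how the two parts combine, assuming plain java.math types rather than HiveDecimalWritable:

import java.math.BigDecimal;
import java.math.BigInteger;

public class DecimalAssembleSketch {
  // One value from the decimal reader: an unscaled integer and a scale.
  static BigDecimal assemble(BigInteger unscaled, int scale) {
    return new BigDecimal(unscaled, scale);
  }

  public static void main(String[] args) {
    System.out.println(assemble(BigInteger.valueOf(123456), 2)); // 1234.56
    System.out.println(assemble(BigInteger.valueOf(-5), 3));     // -0.005
  }
}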
*/ public static class StringDirectTreeReader extends TreeReader { + private static final HadoopShims SHIMS = HadoopShims.Factory.get(); protected InStream stream; - protected TextReaderShim data; + protected HadoopShims.TextReaderShim data; protected IntegerReader lengths; private final LongColumnVector scratchlcv; @@ -1573,7 +1316,7 @@ protected StringDirectTreeReader(int columnId, InStream present, InStream data, this.stream = data; if (length != null && encoding != null) { this.lengths = createIntegerReader(encoding, length, false, false); - this.data = ShimLoader.getHadoopShims().getTextReaderShim(this.stream); + this.data = SHIMS.getTextReaderShim(this.stream); } } @@ -1594,7 +1337,7 @@ void startStripe(Map streams, StreamName name = new StreamName(columnId, OrcProto.Stream.Kind.DATA); stream = streams.get(name); - data = ShimLoader.getHadoopShims().getTextReaderShim(this.stream); + data = SHIMS.getTextReaderShim(this.stream); lengths = createIntegerReader(stripeFooter.getColumnsList().get(columnId).getKind(), streams.get(new StreamName(columnId, OrcProto.Stream.Kind.LENGTH)), false, false); @@ -1613,22 +1356,6 @@ public void seek(PositionProvider index) throws IOException { lengths.seek(index); } - @Override - Object next(Object previous) throws IOException { - super.next(previous); - Text result = null; - if (valuePresent) { - if (previous == null) { - result = new Text(); - } else { - result = (Text) previous; - } - int len = (int) lengths.next(); - data.read(result, len); - } - return result; - } - @Override public void nextVector(ColumnVector previousVector, boolean[] isNull, @@ -1776,31 +1503,6 @@ public void seek(PositionProvider index) throws IOException { reader.seek(index); } - @Override - Object next(Object previous) throws IOException { - super.next(previous); - Text result = null; - if (valuePresent) { - int entry = (int) reader.next(); - if (previous == null) { - result = new Text(); - } else { - result = (Text) previous; - } - int offset = dictionaryOffsets[entry]; - int length = getDictionaryEntryLength(entry, offset); - // If the column is just empty strings, the size will be zero, - // so the buffer will be null, in that case just return result - // as it will default to empty - if (dictionaryBuffer != null) { - dictionaryBuffer.setText(result, offset, length); - } else { - result.clear(); - } - } - return result; - } - @Override public void nextVector(ColumnVector previousVector, boolean[] isNull, @@ -1899,25 +1601,6 @@ protected CharTreeReader(int columnId, int maxLength, InStream present, InStream this.maxLength = maxLength; } - @Override - Object next(Object previous) throws IOException { - final HiveCharWritable result; - if (previous == null) { - result = new HiveCharWritable(); - } else { - result = (HiveCharWritable) previous; - } - // Use the string reader implementation to populate the internal Text value - Object textVal = super.next(result.getTextValue()); - if (textVal == null) { - return null; - } - // result should now hold the value that was read in. 
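[Reviewer note] The StringDictionaryTreeReader code above (including its removed next()) resolves each row's dictionary index through dictionaryOffsets into one shared byte buffer. A simplified standalone lookup, assuming the last entry runs to the end of the blob:

import java.nio.charset.StandardCharsets;

public class DictionaryLookupSketch {
  // Dictionary-encoded strings: all distinct values are concatenated into one
  // blob, offsets[i] marks where entry i starts, and each row stores only an
  // entry index. Mirrors the offset/length math in the removed next() above.
  static String lookup(byte[] dictionaryBlob, int[] offsets, int entry) {
    int start = offsets[entry];
    int end = (entry + 1 < offsets.length) ? offsets[entry + 1] : dictionaryBlob.length;
    return new String(dictionaryBlob, start, end - start, StandardCharsets.UTF_8);
  }

  public static void main(String[] args) {
    byte[] blob = "appleorangepear".getBytes(StandardCharsets.UTF_8);
    int[] offsets = {0, 5, 11};   // "apple", "orange", "pear"
    int[] rows = {2, 0, 0, 1};    // per-row dictionary indexes
    for (int r : rows) {
      System.out.print(lookup(blob, offsets, r) + " "); // pear apple apple orange
    }
    System.out.println();
  }
}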
- // enforce char length - result.enforceMaxLength(maxLength); - return result; - } - @Override public void nextVector(ColumnVector previousVector, boolean[] isNull, @@ -1974,25 +1657,6 @@ protected VarcharTreeReader(int columnId, int maxLength, InStream present, InStr this.maxLength = maxLength; } - @Override - Object next(Object previous) throws IOException { - final HiveVarcharWritable result; - if (previous == null) { - result = new HiveVarcharWritable(); - } else { - result = (HiveVarcharWritable) previous; - } - // Use the string reader implementation to populate the internal Text value - Object textVal = super.next(result.getTextValue()); - if (textVal == null) { - return null; - } - // result should now hold the value that was read in. - // enforce varchar length - result.enforceMaxLength(maxLength); - return result; - } - @Override public void nextVector(ColumnVector previousVector, boolean[] isNull, @@ -2045,8 +1709,6 @@ protected StructTreeReader(int columnId, boolean skipCorrupt) throws IOException { super(columnId); - TypeDescription fileSchema = evolution.getFileType(readerSchema); - List childrenTypes = readerSchema.getChildren(); this.fields = new TreeReader[childrenTypes.size()]; for (int i = 0; i < fields.length; ++i) { @@ -2065,30 +1727,6 @@ void seek(PositionProvider[] index) throws IOException { } } - @Override - Object next(Object previous) throws IOException { - super.next(previous); - OrcStruct result = null; - if (valuePresent) { - if (previous == null) { - result = new OrcStruct(fields.length); - } else { - result = (OrcStruct) previous; - - // If the input format was initialized with a file with a - // different number of fields, the number of fields needs to - // be updated to the correct number - result.setNumFields(fields.length); - } - for (int i = 0; i < fields.length; ++i) { - if (fields[i] != null) { - result.setFieldValue(i, fields[i].next(result.getFieldValue(i))); - } - } - } - return result; - } - @Override public void nextBatch(VectorizedRowBatch batch, int batchSize) throws IOException { @@ -2170,24 +1808,6 @@ void seek(PositionProvider[] index) throws IOException { } } - @Override - Object next(Object previous) throws IOException { - super.next(previous); - OrcUnion result = null; - if (valuePresent) { - if (previous == null) { - result = new OrcUnion(); - } else { - result = (OrcUnion) previous; - } - byte tag = tags.next(); - Object previousVal = result.getObject(); - result.set(tag, fields[tag].next(tag == result.getTag() ? - previousVal : null)); - } - return result; - } - @Override public void nextVector(ColumnVector previousVector, boolean[] isNull, @@ -2259,36 +1879,6 @@ void seek(PositionProvider[] index) throws IOException { elementReader.seek(index); } - @Override - @SuppressWarnings("unchecked") - Object next(Object previous) throws IOException { - super.next(previous); - List result = null; - if (valuePresent) { - if (previous == null) { - result = new ArrayList<>(); - } else { - result = (ArrayList) previous; - } - int prevLength = result.size(); - int length = (int) lengths.next(); - // extend the list to the new length - for (int i = prevLength; i < length; ++i) { - result.add(null); - } - // read the new elements into the array - for (int i = 0; i < length; i++) { - result.set(i, elementReader.next(i < prevLength ? 
- result.get(i) : null)); - } - // remove any extra elements - for (int i = prevLength - 1; i >= length; --i) { - result.remove(i); - } - } - return result; - } - @Override public void nextVector(ColumnVector previous, boolean[] isNull, @@ -2371,28 +1961,6 @@ void seek(PositionProvider[] index) throws IOException { valueReader.seek(index); } - @Override - @SuppressWarnings("unchecked") - Object next(Object previous) throws IOException { - super.next(previous); - Map result = null; - if (valuePresent) { - if (previous == null) { - result = new LinkedHashMap<>(); - } else { - result = (LinkedHashMap) previous; - } - // for now just clear and create new objects - result.clear(); - int length = (int) lengths.next(); - // read the new elements into the array - for (int i = 0; i < length; i++) { - result.put(keyReader.next(null), valueReader.next(null)); - } - } - return result; - } - @Override public void nextVector(ColumnVector previous, boolean[] isNull, diff --git a/shims/0.23/src/main/java/org/apache/hadoop/hive/shims/ZeroCopyShims.java b/orc/src/java/org/apache/orc/impl/ZeroCopyShims.java similarity index 77% rename from shims/0.23/src/main/java/org/apache/hadoop/hive/shims/ZeroCopyShims.java rename to orc/src/java/org/apache/orc/impl/ZeroCopyShims.java index 6ef0467a8e53..de02c8b2788b 100644 --- a/shims/0.23/src/main/java/org/apache/hadoop/hive/shims/ZeroCopyShims.java +++ b/orc/src/java/org/apache/orc/impl/ZeroCopyShims.java @@ -15,7 +15,7 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package org.apache.hadoop.hive.shims; +package org.apache.orc.impl; import java.io.IOException; import java.nio.ByteBuffer; @@ -25,14 +25,11 @@ import org.apache.hadoop.fs.ReadOption; import org.apache.hadoop.io.ByteBufferPool; -import org.apache.hadoop.hive.shims.HadoopShims.ByteBufferPoolShim; -import org.apache.hadoop.hive.shims.HadoopShims.ZeroCopyReaderShim; - class ZeroCopyShims { private static final class ByteBufferPoolAdapter implements ByteBufferPool { - private ByteBufferPoolShim pool; + private HadoopShims.ByteBufferPoolShim pool; - public ByteBufferPoolAdapter(ByteBufferPoolShim pool) { + public ByteBufferPoolAdapter(HadoopShims.ByteBufferPoolShim pool) { this.pool = pool; } @@ -47,7 +44,7 @@ public final void putBuffer(ByteBuffer buffer) { } } - private static final class ZeroCopyAdapter implements ZeroCopyReaderShim { + private static final class ZeroCopyAdapter implements HadoopShims.ZeroCopyReaderShim { private final FSDataInputStream in; private final ByteBufferPoolAdapter pool; private final static EnumSet CHECK_SUM = EnumSet @@ -55,7 +52,8 @@ private static final class ZeroCopyAdapter implements ZeroCopyReaderShim { private final static EnumSet NO_CHECK_SUM = EnumSet .of(ReadOption.SKIP_CHECKSUMS); - public ZeroCopyAdapter(FSDataInputStream in, ByteBufferPoolShim poolshim) { + public ZeroCopyAdapter(FSDataInputStream in, + HadoopShims.ByteBufferPoolShim poolshim) { this.in = in; if (poolshim != null) { pool = new ByteBufferPoolAdapter(poolshim); @@ -76,10 +74,15 @@ public final ByteBuffer readBuffer(int maxLength, boolean verifyChecksums) public final void releaseBuffer(ByteBuffer buffer) { this.in.releaseBuffer(buffer); } + + @Override + public final void close() throws IOException { + this.in.close(); + } } - public static ZeroCopyReaderShim getZeroCopyReader(FSDataInputStream in, - ByteBufferPoolShim pool) throws IOException { + public static HadoopShims.ZeroCopyReaderShim getZeroCopyReader(FSDataInputStream in, + 
HadoopShims.ByteBufferPoolShim pool) throws IOException { return new ZeroCopyAdapter(in, pool); } diff --git a/ql/src/java/org/apache/hadoop/hive/ql/io/orc/FileDump.java b/orc/src/java/org/apache/orc/tools/FileDump.java similarity index 82% rename from ql/src/java/org/apache/hadoop/hive/ql/io/orc/FileDump.java rename to orc/src/java/org/apache/orc/tools/FileDump.java index 9c2f88f88500..e32027faf2b8 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/io/orc/FileDump.java +++ b/orc/src/java/org/apache/orc/tools/FileDump.java @@ -15,7 +15,7 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package org.apache.hadoop.hive.ql.io.orc; +package org.apache.orc.tools; import java.io.IOException; import java.io.OutputStreamWriter; @@ -24,7 +24,6 @@ import java.util.ArrayList; import java.util.Collection; import java.util.List; -import java.util.Map; import org.apache.commons.cli.CommandLine; import org.apache.commons.cli.GnuParser; @@ -39,22 +38,34 @@ import org.apache.hadoop.fs.Path; import org.apache.hadoop.fs.PathFilter; import org.apache.hadoop.hdfs.DistributedFileSystem; -import org.apache.hadoop.hive.ql.io.AcidUtils; +import org.apache.hadoop.hive.ql.exec.vector.BytesColumnVector; +import org.apache.hadoop.hive.ql.exec.vector.ColumnVector; +import org.apache.hadoop.hive.ql.exec.vector.DecimalColumnVector; +import org.apache.hadoop.hive.ql.exec.vector.DoubleColumnVector; +import org.apache.hadoop.hive.ql.exec.vector.ListColumnVector; +import org.apache.hadoop.hive.ql.exec.vector.LongColumnVector; +import org.apache.hadoop.hive.ql.exec.vector.MapColumnVector; +import org.apache.hadoop.hive.ql.exec.vector.StructColumnVector; +import org.apache.hadoop.hive.ql.exec.vector.TimestampColumnVector; +import org.apache.hadoop.hive.ql.exec.vector.UnionColumnVector; +import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch; +import org.apache.hadoop.hive.serde2.io.DateWritable; import org.apache.orc.BloomFilterIO; -import org.apache.hadoop.hive.serde2.io.ByteWritable; -import org.apache.hadoop.hive.serde2.io.DoubleWritable; -import org.apache.hadoop.hive.serde2.io.ShortWritable; -import org.apache.hadoop.io.BooleanWritable; -import org.apache.hadoop.io.FloatWritable; -import org.apache.hadoop.io.IntWritable; -import org.apache.hadoop.io.LongWritable; import org.apache.orc.ColumnStatistics; +import org.apache.orc.CompressionKind; +import org.apache.orc.OrcFile; +import org.apache.orc.Reader; +import org.apache.orc.RecordReader; import org.apache.orc.TypeDescription; +import org.apache.orc.Writer; +import org.apache.orc.impl.AcidStats; import org.apache.orc.impl.ColumnStatisticsImpl; +import org.apache.orc.impl.OrcAcidUtils; import org.apache.orc.impl.OrcIndex; import org.apache.orc.OrcProto; import org.apache.orc.StripeInformation; import org.apache.orc.StripeStatistics; +import org.apache.orc.impl.RecordReaderImpl; import org.codehaus.jettison.json.JSONException; import org.codehaus.jettison.json.JSONWriter; @@ -74,7 +85,7 @@ public final class FileDump { public boolean accept(Path p) { String name = p.getName(); return !name.startsWith("_") && !name.startsWith(".") && !name.endsWith( - AcidUtils.DELTA_SIDE_FILE_SUFFIX); + OrcAcidUtils.DELTA_SIDE_FILE_SUFFIX); } }; @@ -171,7 +182,7 @@ static Reader getReader(final Path path, final Configuration conf, FileSystem fs = path.getFileSystem(conf); long dataFileLen = fs.getFileStatus(path).getLen(); System.err.println("Processing data file " + path + " [length: " + dataFileLen + "]"); - Path 
sideFile = OrcRecordUpdater.getSideFile(path); + Path sideFile = OrcAcidUtils.getSideFile(path); final boolean sideFileExists = fs.exists(sideFile); boolean openDataFile = false; boolean openSideFile = false; @@ -198,7 +209,7 @@ static Reader getReader(final Path path, final Configuration conf, Reader reader = null; if (sideFileExists) { - final long maxLen = OrcRawRecordMerger.getLastFlushLength(fs, path); + final long maxLen = OrcAcidUtils.getLastFlushLength(fs, path); final long sideFileLen = fs.getFileStatus(sideFile).getLen(); System.err.println("Found flush length file " + sideFile + " [length: " + sideFileLen + ", maxFooterOffset: " + maxLen + "]"); @@ -320,11 +331,11 @@ private static void printMetaDataImpl(final String filename, " with " + reader.getWriterVersion()); RecordReaderImpl rows = (RecordReaderImpl) reader.rows(); System.out.println("Rows: " + reader.getNumberOfRows()); - System.out.println("Compression: " + reader.getCompression()); - if (reader.getCompression() != CompressionKind.NONE) { + System.out.println("Compression: " + reader.getCompressionKind()); + if (reader.getCompressionKind() != CompressionKind.NONE) { System.out.println("Compression size: " + reader.getCompressionSize()); } - System.out.println("Type: " + reader.getObjectInspector().getTypeName()); + System.out.println("Type: " + reader.getSchema().toString()); System.out.println("\nStripe Statistics:"); List stripeStats = reader.getStripeStatistics(); for (int n = 0; n < stripeStats.size(); n++) { @@ -408,7 +419,7 @@ private static void printMetaDataImpl(final String filename, System.out.println("\nFile length: " + fileLen + " bytes"); System.out.println("Padding length: " + paddedBytes + " bytes"); System.out.println("Padding ratio: " + format.format(percentPadding) + "%"); - OrcRecordUpdater.AcidStats acidStats = OrcRecordUpdater.parseAcidStats(reader); + AcidStats acidStats = OrcAcidUtils.parseAcidStats(reader); if (acidStats != null) { System.out.println("ACID stats:" + acidStats); } @@ -541,7 +552,7 @@ private static void recoverFile(final Path corruptPath, final FileSystem fs, moveFiles(fs, corruptPath, backupDataPath); // Move side file to backup path - Path sideFilePath = OrcRecordUpdater.getSideFile(corruptPath); + Path sideFilePath = OrcAcidUtils.getSideFile(corruptPath); Path backupSideFilePath = new Path(backupDataPath.getParent(), sideFilePath.getName()); moveFiles(fs, sideFilePath, backupSideFilePath); @@ -758,123 +769,162 @@ static Options createOptions() { } private static void printMap(JSONWriter writer, - Map obj, - List types, - OrcProto.Type type - ) throws IOException, JSONException { + MapColumnVector vector, + TypeDescription schema, + int row) throws JSONException { writer.array(); - int keyType = type.getSubtypes(0); - int valueType = type.getSubtypes(1); - for (Map.Entry item : obj.entrySet()) { + TypeDescription keyType = schema.getChildren().get(0); + TypeDescription valueType = schema.getChildren().get(1); + int offset = (int) vector.offsets[row]; + for (int i = 0; i < vector.lengths[row]; ++i) { writer.object(); writer.key("_key"); - printObject(writer, item.getKey(), types, keyType); + printValue(writer, vector.keys, keyType, offset + i); writer.key("_value"); - printObject(writer, item.getValue(), types, valueType); + printValue(writer, vector.values, valueType, offset + i); writer.endObject(); } writer.endArray(); } private static void printList(JSONWriter writer, - List obj, - List types, - OrcProto.Type type - ) throws IOException, JSONException { - int subtype = 
type.getSubtypes(0); + ListColumnVector vector, + TypeDescription schema, + int row) throws JSONException { writer.array(); - for (Object item : obj) { - printObject(writer, item, types, subtype); + int offset = (int) vector.offsets[row]; + TypeDescription childType = schema.getChildren().get(0); + for (int i = 0; i < vector.lengths[row]; ++i) { + printValue(writer, vector.child, childType, offset + i); } writer.endArray(); } private static void printUnion(JSONWriter writer, - OrcUnion obj, - List types, - OrcProto.Type type - ) throws IOException, JSONException { - int subtype = type.getSubtypes(obj.getTag()); - printObject(writer, obj.getObject(), types, subtype); + UnionColumnVector vector, + TypeDescription schema, + int row) throws JSONException { + int tag = vector.tags[row]; + printValue(writer, vector.fields[tag], schema.getChildren().get(tag), row); } static void printStruct(JSONWriter writer, - OrcStruct obj, - List types, - OrcProto.Type type) throws IOException, JSONException { + StructColumnVector batch, + TypeDescription schema, + int row) throws JSONException { writer.object(); - List fieldTypes = type.getSubtypesList(); + List fieldNames = schema.getFieldNames(); + List fieldTypes = schema.getChildren(); for (int i = 0; i < fieldTypes.size(); ++i) { - writer.key(type.getFieldNames(i)); - printObject(writer, obj.getFieldValue(i), types, fieldTypes.get(i)); + writer.key(fieldNames.get(i)); + printValue(writer, batch.fields[i], fieldTypes.get(i), row); } writer.endObject(); } - static void printObject(JSONWriter writer, - Object obj, - List types, - int typeId) throws IOException, JSONException { - OrcProto.Type type = types.get(typeId); - if (obj == null) { - writer.value(null); - } else { - switch (type.getKind()) { - case STRUCT: - printStruct(writer, (OrcStruct) obj, types, type); + static void printBinary(JSONWriter writer, BytesColumnVector vector, + int row) throws JSONException { + writer.array(); + int offset = vector.start[row]; + for(int i=0; i < vector.length[row]; ++i) { + writer.value(0xff & (int) vector.vector[row][offset + i]); + } + writer.endArray(); + } + static void printValue(JSONWriter writer, ColumnVector vector, + TypeDescription schema, int row) throws JSONException { + if (vector.isRepeating) { + row = 0; + } + if (vector.noNulls || !vector.isNull[row]) { + switch (schema.getCategory()) { + case BOOLEAN: + writer.value(((LongColumnVector) vector).vector[row] != 0); break; - case UNION: - printUnion(writer, (OrcUnion) obj, types, type); + case BYTE: + case SHORT: + case INT: + case LONG: + writer.value(((LongColumnVector) vector).vector[row]); break; - case LIST: - printList(writer, (List) obj, types, type); + case FLOAT: + case DOUBLE: + writer.value(((DoubleColumnVector) vector).vector[row]); break; - case MAP: - printMap(writer, (Map) obj, types, type); + case STRING: + case CHAR: + case VARCHAR: + writer.value(((BytesColumnVector) vector).toString(row)); break; - case BYTE: - writer.value(((ByteWritable) obj).get()); + case BINARY: + printBinary(writer, (BytesColumnVector) vector, row); break; - case SHORT: - writer.value(((ShortWritable) obj).get()); + case DECIMAL: + writer.value(((DecimalColumnVector) vector).vector[row].toString()); break; - case INT: - writer.value(((IntWritable) obj).get()); + case DATE: + writer.value(new DateWritable( + (int) ((LongColumnVector) vector).vector[row]).toString()); break; - case LONG: - writer.value(((LongWritable) obj).get()); + case TIMESTAMP: + writer.value(((TimestampColumnVector) vector) + 
            .asScratchTimestamp(row).toString());
         break;
-      case FLOAT:
-        writer.value(((FloatWritable) obj).get());
+      case LIST:
+        printList(writer, (ListColumnVector) vector, schema, row);
         break;
-      case DOUBLE:
-        writer.value(((DoubleWritable) obj).get());
+      case MAP:
+        printMap(writer, (MapColumnVector) vector, schema, row);
         break;
-      case BOOLEAN:
-        writer.value(((BooleanWritable) obj).get());
+      case STRUCT:
+        printStruct(writer, (StructColumnVector) vector, schema, row);
         break;
-      default:
-        writer.value(obj.toString());
+      case UNION:
+        printUnion(writer, (UnionColumnVector) vector, schema, row);
         break;
+      default:
+        throw new IllegalArgumentException("Unknown type " +
+            schema.toString());
       }
+    } else {
+      writer.value(null);
+    }
+  }
+
+  static void printRow(JSONWriter writer,
+                       VectorizedRowBatch batch,
+                       TypeDescription schema,
+                       int row) throws JSONException {
+    if (schema.getCategory() == TypeDescription.Category.STRUCT) {
+      List<TypeDescription> fieldTypes = schema.getChildren();
+      List<String> fieldNames = schema.getFieldNames();
+      writer.object();
+      for (int c = 0; c < batch.cols.length; ++c) {
+        writer.key(fieldNames.get(c));
+        printValue(writer, batch.cols[c], fieldTypes.get(c), row);
+      }
+      writer.endObject();
+    } else {
+      printValue(writer, batch.cols[0], schema, row);
     }
   }
 
   static void printJsonData(final Reader reader) throws IOException, JSONException {
     PrintStream printStream = System.out;
     OutputStreamWriter out = new OutputStreamWriter(printStream, "UTF-8");
-    RecordReader rows = reader.rows(null);
-    Object row = null;
+    RecordReader rows = reader.rows();
     try {
-      List<OrcProto.Type> types = reader.getTypes();
-      while (rows.hasNext()) {
-        row = rows.next(row);
-        JSONWriter writer = new JSONWriter(out);
-        printObject(writer, row, types, 0);
-        out.write("\n");
-        out.flush();
-        if (printStream.checkError()) {
-          throw new IOException("Error encountered when writing to stdout.");
+      TypeDescription schema = reader.getSchema();
+      VectorizedRowBatch batch = schema.createRowBatch();
+      while (rows.nextBatch(batch)) {
+        for(int r=0; r < batch.size; ++r) {
+          JSONWriter writer = new JSONWriter(out);
+          printRow(writer, batch, schema, r);
+          out.write("\n");
+          out.flush();
+          if (printStream.checkError()) {
+            throw new IOException("Error encountered when writing to stdout.");
+          }
         }
       }
     } finally {
diff --git a/ql/src/java/org/apache/hadoop/hive/ql/io/orc/JsonFileDump.java b/orc/src/java/org/apache/orc/tools/JsonFileDump.java
similarity index 96%
rename from ql/src/java/org/apache/hadoop/hive/ql/io/orc/JsonFileDump.java
rename to orc/src/java/org/apache/orc/tools/JsonFileDump.java
index 00de5451d277..75153a228de2 100644
--- a/ql/src/java/org/apache/hadoop/hive/ql/io/orc/JsonFileDump.java
+++ b/orc/src/java/org/apache/orc/tools/JsonFileDump.java
@@ -15,7 +15,7 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
-package org.apache.hadoop.hive.ql.io.orc;
+package org.apache.orc.tools;
 
 import java.io.IOException;
 import java.util.List;
@@ -24,6 +24,11 @@
 import org.apache.hadoop.conf.Configuration;
 import org.apache.hadoop.fs.FileSystem;
 import org.apache.hadoop.fs.Path;
+import org.apache.orc.CompressionKind;
+import org.apache.orc.Reader;
+import org.apache.orc.impl.AcidStats;
+import org.apache.orc.impl.OrcAcidUtils;
+import org.apache.orc.impl.RecordReaderImpl;
 import org.codehaus.jettison.json.JSONArray;
 import org.apache.orc.BloomFilterIO;
 import org.apache.orc.BinaryColumnStatistics;
@@ -80,11 +85,11 @@ public static void printJsonMetaData(List<String> files,
       writer.key("writerVersion").value(reader.getWriterVersion());
       RecordReaderImpl rows = (RecordReaderImpl) reader.rows();
       writer.key("numberOfRows").value(reader.getNumberOfRows());
-      writer.key("compression").value(reader.getCompression());
-      if (reader.getCompression() != CompressionKind.NONE) {
+      writer.key("compression").value(reader.getCompressionKind());
+      if (reader.getCompressionKind() != CompressionKind.NONE) {
         writer.key("compressionBufferSize").value(reader.getCompressionSize());
       }
-      writer.key("schemaString").value(reader.getObjectInspector().getTypeName());
+      writer.key("schemaString").value(reader.getSchema().toString());
       writer.key("schema").array();
       writeSchema(writer, reader.getTypes());
       writer.endArray();
@@ -191,7 +196,7 @@ public static void printJsonMetaData(List<String> files,
       writer.key("fileLength").value(fileLen);
       writer.key("paddingLength").value(paddedBytes);
       writer.key("paddingRatio").value(percentPadding);
-      OrcRecordUpdater.AcidStats acidStats = OrcRecordUpdater.parseAcidStats(reader);
+      AcidStats acidStats = OrcAcidUtils.parseAcidStats(reader);
       if (acidStats != null) {
         writer.key("numInserts").value(acidStats.inserts);
         writer.key("numDeletes").value(acidStats.deletes);
diff --git a/ql/src/test/org/apache/hadoop/hive/ql/io/orc/TestColumnStatistics.java b/orc/src/test/org/apache/orc/TestColumnStatistics.java
similarity index 85%
rename from ql/src/test/org/apache/hadoop/hive/ql/io/orc/TestColumnStatistics.java
rename to orc/src/test/org/apache/orc/TestColumnStatistics.java
index 5f0146fccd9c..1837dbb433b3 100644
--- a/ql/src/test/org/apache/hadoop/hive/ql/io/orc/TestColumnStatistics.java
+++ b/orc/src/test/org/apache/orc/TestColumnStatistics.java
@@ -16,7 +16,7 @@
  * limitations under the License.
*/ -package org.apache.hadoop.hive.ql.io.orc; +package org.apache.orc; import static junit.framework.Assert.assertEquals; import static org.junit.Assume.assumeTrue; @@ -31,21 +31,14 @@ import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.Path; import org.apache.hadoop.hive.common.type.HiveDecimal; +import org.apache.hadoop.hive.ql.exec.vector.BytesColumnVector; +import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch; import org.apache.hadoop.hive.serde2.io.DateWritable; -import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector; -import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorFactory; import org.apache.hadoop.io.BytesWritable; import org.apache.hadoop.io.Text; -import org.apache.orc.ColumnStatistics; import org.apache.orc.impl.ColumnStatisticsImpl; -import org.apache.orc.DateColumnStatistics; -import org.apache.orc.DecimalColumnStatistics; -import org.apache.orc.DoubleColumnStatistics; -import org.apache.orc.IntegerColumnStatistics; -import org.apache.orc.StringColumnStatistics; -import org.apache.orc.StripeStatistics; -import org.apache.orc.TimestampColumnStatistics; -import org.apache.orc.TypeDescription; +import org.apache.orc.tools.FileDump; +import org.apache.orc.tools.TestFileDump; import org.junit.Before; import org.junit.Rule; import org.junit.Test; @@ -194,20 +187,6 @@ public void testDecimalMerge() throws Exception { } - public static class SimpleStruct { - BytesWritable bytes1; - Text string1; - - SimpleStruct(BytesWritable b1, String s1) { - this.bytes1 = b1; - if (s1 == null) { - this.string1 = null; - } else { - this.string1 = new Text(s1); - } - } - } - Path workDir = new Path(System.getProperty("test.tmp.dir", "target" + File.separator + "test" + File.separator + "tmp")); @@ -236,53 +215,86 @@ private static BytesWritable bytes(int... 
items) { return result; } + void appendRow(VectorizedRowBatch batch, BytesWritable bytes, + String str) { + int row = batch.size++; + if (bytes == null) { + batch.cols[0].noNulls = false; + batch.cols[0].isNull[row] = true; + } else { + ((BytesColumnVector) batch.cols[0]).setVal(row, bytes.getBytes(), + 0, bytes.getLength()); + } + if (str == null) { + batch.cols[1].noNulls = false; + batch.cols[1].isNull[row] = true; + } else { + ((BytesColumnVector) batch.cols[1]).setVal(row, str.getBytes()); + } + } + @Test public void testHasNull() throws Exception { - - ObjectInspector inspector; - synchronized (TestOrcFile.class) { - inspector = ObjectInspectorFactory.getReflectionObjectInspector - (SimpleStruct.class, ObjectInspectorFactory.ObjectInspectorOptions.JAVA); - } + TypeDescription schema = + TypeDescription.createStruct() + .addField("bytes1", TypeDescription.createBinary()) + .addField("string1", TypeDescription.createString()); Writer writer = OrcFile.createWriter(testFilePath, OrcFile.writerOptions(conf) - .inspector(inspector) + .setSchema(schema) .rowIndexStride(1000) .stripeSize(10000) .bufferSize(10000)); + VectorizedRowBatch batch = schema.createRowBatch(5000); // STRIPE 1 // RG1 for(int i=0; i<1000; i++) { - writer.addRow(new SimpleStruct(bytes(1,2,3), "RG1")); + appendRow(batch, bytes(1, 2, 3), "RG1"); } + writer.addRowBatch(batch); + batch.reset(); // RG2 for(int i=0; i<1000; i++) { - writer.addRow(new SimpleStruct(bytes(1,2,3), null)); + appendRow(batch, bytes(1, 2, 3), null); } + writer.addRowBatch(batch); + batch.reset(); // RG3 for(int i=0; i<1000; i++) { - writer.addRow(new SimpleStruct(bytes(1,2,3), "RG3")); + appendRow(batch, bytes(1, 2, 3), "RG3"); } + writer.addRowBatch(batch); + batch.reset(); // RG4 - for(int i=0; i<1000; i++) { - writer.addRow(new SimpleStruct(bytes(1,2,3), null)); + for (int i = 0; i < 1000; i++) { + appendRow(batch, bytes(1,2,3), null); } + writer.addRowBatch(batch); + batch.reset(); // RG5 for(int i=0; i<1000; i++) { - writer.addRow(new SimpleStruct(bytes(1,2,3), null)); + appendRow(batch, bytes(1, 2, 3), null); } + writer.addRowBatch(batch); + batch.reset(); // STRIPE 2 - for(int i=0; i<5000; i++) { - writer.addRow(new SimpleStruct(bytes(1,2,3), null)); + for (int i = 0; i < 5000; i++) { + appendRow(batch, bytes(1,2,3), null); } + writer.addRowBatch(batch); + batch.reset(); // STRIPE 3 - for(int i=0; i<5000; i++) { - writer.addRow(new SimpleStruct(bytes(1,2,3), "STRIPE-3")); + for (int i = 0; i < 5000; i++) { + appendRow(batch, bytes(1,2,3), "STRIPE-3"); } + writer.addRowBatch(batch); + batch.reset(); // STRIPE 4 - for(int i=0; i<5000; i++) { - writer.addRow(new SimpleStruct(bytes(1,2,3), null)); + for (int i = 0; i < 5000; i++) { + appendRow(batch, bytes(1,2,3), null); } + writer.addRowBatch(batch); + batch.reset(); writer.close(); Reader reader = OrcFile.createReader(testFilePath, OrcFile.readerOptions(conf).filesystem(fs)); diff --git a/ql/src/test/org/apache/hadoop/hive/ql/io/orc/TestNewIntegerEncoding.java b/orc/src/test/org/apache/orc/TestNewIntegerEncoding.java similarity index 51% rename from ql/src/test/org/apache/hadoop/hive/ql/io/orc/TestNewIntegerEncoding.java rename to orc/src/test/org/apache/orc/TestNewIntegerEncoding.java index f41a7ba8a9c3..526dd81dfc2c 100644 --- a/ql/src/test/org/apache/hadoop/hive/ql/io/orc/TestNewIntegerEncoding.java +++ b/orc/src/test/org/apache/orc/TestNewIntegerEncoding.java @@ -15,7 +15,7 @@ * See the License for the specific language governing permissions and * limitations under the License. 
*/ -package org.apache.hadoop.hive.ql.io.orc; +package org.apache.orc; import static junit.framework.Assert.assertEquals; @@ -29,12 +29,9 @@ import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.Path; -import org.apache.hadoop.hive.serde2.io.TimestampWritable; -import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector; -import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorFactory; -import org.apache.hadoop.io.IntWritable; -import org.apache.hadoop.io.LongWritable; -import org.apache.orc.CompressionKind; +import org.apache.hadoop.hive.ql.exec.vector.LongColumnVector; +import org.apache.hadoop.hive.ql.exec.vector.TimestampColumnVector; +import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch; import org.junit.Before; import org.junit.Rule; import org.junit.Test; @@ -70,14 +67,23 @@ public TSRow(Timestamp ts) { } } - public static class Row { - Integer int1; - Long long1; + public static TypeDescription getRowSchema() { + return TypeDescription.createStruct() + .addField("int1", TypeDescription.createInt()) + .addField("long1", TypeDescription.createLong()); + } - public Row(int val, long l) { - this.int1 = val; - this.long1 = l; - } + public static void appendRow(VectorizedRowBatch batch, + int int1, long long1) { + int row = batch.size++; + ((LongColumnVector) batch.cols[0]).vector[row] = int1; + ((LongColumnVector) batch.cols[1]).vector[row] = long1; + } + + public static void appendLong(VectorizedRowBatch batch, + long long1) { + int row = batch.size++; + ((LongColumnVector) batch.cols[0]).vector[row] = long1; } Path workDir = new Path(System.getProperty("test.tmp.dir", "target" @@ -101,42 +107,36 @@ public void openFileSystem() throws Exception { @Test public void testBasicRow() throws Exception { - ObjectInspector inspector; - synchronized (TestOrcFile.class) { - inspector = ObjectInspectorFactory.getReflectionObjectInspector( - Row.class, ObjectInspectorFactory.ObjectInspectorOptions.JAVA); - } - + TypeDescription schema= getRowSchema(); Writer writer = OrcFile.createWriter(testFilePath, OrcFile.writerOptions(conf) - .inspector(inspector) + .setSchema(schema) .stripeSize(100000) .compress(CompressionKind.NONE) .bufferSize(10000) .encodingStrategy(encodingStrategy)); - writer.addRow(new Row(111, 1111L)); - writer.addRow(new Row(111, 1111L)); - writer.addRow(new Row(111, 1111L)); + VectorizedRowBatch batch = schema.createRowBatch(); + appendRow(batch, 111, 1111L); + appendRow(batch, 111, 1111L); + appendRow(batch, 111, 1111L); + writer.addRowBatch(batch); writer.close(); Reader reader = OrcFile.createReader(testFilePath, OrcFile.readerOptions(conf).filesystem(fs)); RecordReader rows = reader.rows(); - while (rows.hasNext()) { - Object row = rows.next(null); - assertEquals(new IntWritable(111), ((OrcStruct) row).getFieldValue(0)); - assertEquals(new LongWritable(1111), ((OrcStruct) row).getFieldValue(1)); + batch = reader.getSchema().createRowBatch(); + while (rows.nextBatch(batch)) { + for(int r=0; r < batch.size; ++r) { + assertEquals(111, ((LongColumnVector) batch.cols[0]).vector[r]); + assertEquals(1111, ((LongColumnVector) batch.cols[1]).vector[r]); + } } } @Test public void testBasicOld() throws Exception { - ObjectInspector inspector; - synchronized (TestOrcFile.class) { - inspector = ObjectInspectorFactory.getReflectionObjectInspector( - Long.class, ObjectInspectorFactory.ObjectInspectorOptions.JAVA); - } - + TypeDescription schema = TypeDescription.createLong(); long[] inp = new long[] { 
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 1, 1, 1, 1, 1, 1, 10, 9, 7, 6, 5, 4, 3, 2, 1, 1, 1, 1, 1, 2, 5, 1, 3, 7, 1, 9, 2, 6, 3, 7, 1, 9, 2, 6, 3, 7, 1, 9, 2, 6, 3, 7, 1, @@ -145,33 +145,34 @@ public void testBasicOld() throws Exception { List input = Lists.newArrayList(Longs.asList(inp)); Writer writer = OrcFile.createWriter(testFilePath, OrcFile.writerOptions(conf) - .inspector(inspector) + .setSchema(schema) .compress(CompressionKind.NONE) .version(OrcFile.Version.V_0_11) .bufferSize(10000) .encodingStrategy(encodingStrategy)); + VectorizedRowBatch batch = schema.createRowBatch(); for(Long l : input) { - writer.addRow(l); + appendLong(batch, l); } + writer.addRowBatch(batch); writer.close(); Reader reader = OrcFile.createReader(testFilePath, OrcFile.readerOptions(conf).filesystem(fs)); RecordReader rows = reader.rows(); int idx = 0; - while (rows.hasNext()) { - Object row = rows.next(null); - assertEquals(input.get(idx++).longValue(), ((LongWritable) row).get()); + batch = reader.getSchema().createRowBatch(); + while (rows.nextBatch(batch)) { + for(int r=0; r < batch.size; ++r) { + assertEquals(input.get(idx++).longValue(), + ((LongColumnVector) batch.cols[0]).vector[r]); + } } } @Test public void testBasicNew() throws Exception { - ObjectInspector inspector; - synchronized (TestOrcFile.class) { - inspector = ObjectInspectorFactory.getReflectionObjectInspector( - Long.class, ObjectInspectorFactory.ObjectInspectorOptions.JAVA); - } + TypeDescription schema = TypeDescription.createLong(); long[] inp = new long[] { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 1, 1, 1, 1, 1, 1, 10, 9, 7, 6, 5, 4, 3, 2, 1, 1, 1, 1, 1, @@ -181,167 +182,171 @@ public void testBasicNew() throws Exception { List input = Lists.newArrayList(Longs.asList(inp)); Writer writer = OrcFile.createWriter(testFilePath, - OrcFile.writerOptions(conf) - .inspector(inspector) - .stripeSize(100000) - .compress(CompressionKind.NONE) - .bufferSize(10000) - .encodingStrategy(encodingStrategy)); + OrcFile.writerOptions(conf) + .setSchema(schema) + .stripeSize(100000) + .compress(CompressionKind.NONE) + .bufferSize(10000) + .encodingStrategy(encodingStrategy)); + VectorizedRowBatch batch = schema.createRowBatch(); for(Long l : input) { - writer.addRow(l); + appendLong(batch, l); } + writer.addRowBatch(batch); writer.close(); Reader reader = OrcFile.createReader(testFilePath, OrcFile.readerOptions(conf).filesystem(fs)); RecordReader rows = reader.rows(); int idx = 0; - while (rows.hasNext()) { - Object row = rows.next(null); - assertEquals(input.get(idx++).longValue(), ((LongWritable) row).get()); + batch = reader.getSchema().createRowBatch(); + while (rows.nextBatch(batch)) { + for(int r=0; r < batch.size; ++r) { + assertEquals(input.get(idx++).longValue(), + ((LongColumnVector) batch.cols[0]).vector[r]); + } } } @Test public void testBasicDelta1() throws Exception { - ObjectInspector inspector; - synchronized (TestOrcFile.class) { - inspector = ObjectInspectorFactory.getReflectionObjectInspector( - Long.class, ObjectInspectorFactory.ObjectInspectorOptions.JAVA); - } + TypeDescription schema = TypeDescription.createLong(); long[] inp = new long[] { -500, -400, -350, -325, -310 }; List input = Lists.newArrayList(Longs.asList(inp)); Writer writer = OrcFile.createWriter(testFilePath, - OrcFile.writerOptions(conf) - .inspector(inspector) - .stripeSize(100000) - .compress(CompressionKind.NONE) - .bufferSize(10000) - .encodingStrategy(encodingStrategy)); + OrcFile.writerOptions(conf) + 
.setSchema(schema) + .stripeSize(100000) + .compress(CompressionKind.NONE) + .bufferSize(10000) + .encodingStrategy(encodingStrategy)); + VectorizedRowBatch batch = schema.createRowBatch(); for(Long l : input) { - writer.addRow(l); + appendLong(batch, l); } + writer.addRowBatch(batch); writer.close(); Reader reader = OrcFile.createReader(testFilePath, OrcFile.readerOptions(conf).filesystem(fs)); RecordReader rows = reader.rows(); + batch = reader.getSchema().createRowBatch(); int idx = 0; - while (rows.hasNext()) { - Object row = rows.next(null); - assertEquals(input.get(idx++).longValue(), ((LongWritable) row).get()); + while (rows.nextBatch(batch)) { + for(int r=0; r < batch.size; ++r) { + assertEquals(input.get(idx++).longValue(), + ((LongColumnVector) batch.cols[0]).vector[r]); + } } } @Test public void testBasicDelta2() throws Exception { - ObjectInspector inspector; - synchronized (TestOrcFile.class) { - inspector = ObjectInspectorFactory.getReflectionObjectInspector( - Long.class, ObjectInspectorFactory.ObjectInspectorOptions.JAVA); - } + TypeDescription schema = TypeDescription.createLong(); long[] inp = new long[] { -500, -600, -650, -675, -710 }; List input = Lists.newArrayList(Longs.asList(inp)); Writer writer = OrcFile.createWriter(testFilePath, - OrcFile.writerOptions(conf) - .inspector(inspector) - .stripeSize(100000) - .compress(CompressionKind.NONE) - .bufferSize(10000) - .encodingStrategy(encodingStrategy)); + OrcFile.writerOptions(conf) + .setSchema(schema) + .stripeSize(100000) + .compress(CompressionKind.NONE) + .bufferSize(10000) + .encodingStrategy(encodingStrategy)); + VectorizedRowBatch batch = schema.createRowBatch(); for(Long l : input) { - writer.addRow(l); + appendLong(batch, l); } + writer.addRowBatch(batch); writer.close(); Reader reader = OrcFile.createReader(testFilePath, OrcFile.readerOptions(conf).filesystem(fs)); RecordReader rows = reader.rows(); + batch = reader.getSchema().createRowBatch(); int idx = 0; - while (rows.hasNext()) { - Object row = rows.next(null); - assertEquals(input.get(idx++).longValue(), ((LongWritable) row).get()); + while (rows.nextBatch(batch)) { + for(int r=0; r < batch.size; ++r) { + assertEquals(input.get(idx++).longValue(), + ((LongColumnVector) batch.cols[0]).vector[r]); + } } } @Test public void testBasicDelta3() throws Exception { - ObjectInspector inspector; - synchronized (TestOrcFile.class) { - inspector = ObjectInspectorFactory.getReflectionObjectInspector( - Long.class, ObjectInspectorFactory.ObjectInspectorOptions.JAVA); - } + TypeDescription schema = TypeDescription.createLong(); long[] inp = new long[] { 500, 400, 350, 325, 310 }; List input = Lists.newArrayList(Longs.asList(inp)); Writer writer = OrcFile.createWriter(testFilePath, - OrcFile.writerOptions(conf) - .inspector(inspector) - .stripeSize(100000) - .compress(CompressionKind.NONE) - .bufferSize(10000) - .encodingStrategy(encodingStrategy)); + OrcFile.writerOptions(conf) + .setSchema(schema) + .stripeSize(100000) + .compress(CompressionKind.NONE) + .bufferSize(10000) + .encodingStrategy(encodingStrategy)); + VectorizedRowBatch batch = schema.createRowBatch(); for(Long l : input) { - writer.addRow(l); + appendLong(batch, l); } + writer.addRowBatch(batch); writer.close(); Reader reader = OrcFile.createReader(testFilePath, OrcFile.readerOptions(conf).filesystem(fs)); RecordReader rows = reader.rows(); + batch = reader.getSchema().createRowBatch(); int idx = 0; - while (rows.hasNext()) { - Object row = rows.next(null); - assertEquals(input.get(idx++).longValue(), 
((LongWritable) row).get()); + while (rows.nextBatch(batch)) { + for(int r=0; r < batch.size; ++r) { + assertEquals(input.get(idx++).longValue(), + ((LongColumnVector) batch.cols[0]).vector[r]); + } } } @Test public void testBasicDelta4() throws Exception { - ObjectInspector inspector; - synchronized (TestOrcFile.class) { - inspector = ObjectInspectorFactory.getReflectionObjectInspector( - Long.class, ObjectInspectorFactory.ObjectInspectorOptions.JAVA); - } + TypeDescription schema = TypeDescription.createLong(); long[] inp = new long[] { 500, 600, 650, 675, 710 }; List input = Lists.newArrayList(Longs.asList(inp)); Writer writer = OrcFile.createWriter(testFilePath, - OrcFile.writerOptions(conf) - .inspector(inspector) - .stripeSize(100000) - .compress(CompressionKind.NONE) - .bufferSize(10000) - .encodingStrategy(encodingStrategy)); + OrcFile.writerOptions(conf) + .setSchema(schema) + .stripeSize(100000) + .compress(CompressionKind.NONE) + .bufferSize(10000) + .encodingStrategy(encodingStrategy)); + VectorizedRowBatch batch = schema.createRowBatch(); for(Long l : input) { - writer.addRow(l); + appendLong(batch, l); } + writer.addRowBatch(batch); writer.close(); Reader reader = OrcFile.createReader(testFilePath, OrcFile.readerOptions(conf).filesystem(fs)); RecordReader rows = reader.rows(); + batch = reader.getSchema().createRowBatch(); int idx = 0; - while (rows.hasNext()) { - Object row = rows.next(null); - assertEquals(input.get(idx++).longValue(), ((LongWritable) row).get()); + while (rows.nextBatch(batch)) { + for(int r=0; r < batch.size; ++r) { + assertEquals(input.get(idx++).longValue(), + ((LongColumnVector) batch.cols[0]).vector[r]); + } } } @Test public void testDeltaOverflow() throws Exception { - ObjectInspector inspector; - synchronized (TestOrcFile.class) { - inspector = ObjectInspectorFactory - .getReflectionObjectInspector(Long.class, - ObjectInspectorFactory.ObjectInspectorOptions.JAVA); - } + TypeDescription schema = TypeDescription.createLong(); long[] inp = new long[]{4513343538618202719l, 4513343538618202711l, 2911390882471569739l, @@ -350,31 +355,31 @@ public void testDeltaOverflow() throws Exception { Writer writer = OrcFile.createWriter( testFilePath, - OrcFile.writerOptions(conf).inspector(inspector).stripeSize(100000) + OrcFile.writerOptions(conf).setSchema(schema).stripeSize(100000) .compress(CompressionKind.NONE).bufferSize(10000)); + VectorizedRowBatch batch = schema.createRowBatch(); for (Long l : input) { - writer.addRow(l); + appendLong(batch, l); } + writer.addRowBatch(batch); writer.close(); Reader reader = OrcFile .createReader(testFilePath, OrcFile.readerOptions(conf).filesystem(fs)); RecordReader rows = reader.rows(); + batch = reader.getSchema().createRowBatch(); int idx = 0; - while (rows.hasNext()) { - Object row = rows.next(null); - assertEquals(input.get(idx++).longValue(), ((LongWritable) row).get()); + while (rows.nextBatch(batch)) { + for(int r=0; r < batch.size; ++r) { + assertEquals(input.get(idx++).longValue(), + ((LongColumnVector) batch.cols[0]).vector[r]); + } } } @Test public void testDeltaOverflow2() throws Exception { - ObjectInspector inspector; - synchronized (TestOrcFile.class) { - inspector = ObjectInspectorFactory - .getReflectionObjectInspector(Long.class, - ObjectInspectorFactory.ObjectInspectorOptions.JAVA); - } + TypeDescription schema = TypeDescription.createLong(); long[] inp = new long[]{Long.MAX_VALUE, 4513343538618202711l, 2911390882471569739l, @@ -383,31 +388,31 @@ public void testDeltaOverflow2() throws Exception { 
Writer writer = OrcFile.createWriter( testFilePath, - OrcFile.writerOptions(conf).inspector(inspector).stripeSize(100000) + OrcFile.writerOptions(conf).setSchema(schema).stripeSize(100000) .compress(CompressionKind.NONE).bufferSize(10000)); + VectorizedRowBatch batch = schema.createRowBatch(); for (Long l : input) { - writer.addRow(l); + appendLong(batch, l); } + writer.addRowBatch(batch); writer.close(); Reader reader = OrcFile .createReader(testFilePath, OrcFile.readerOptions(conf).filesystem(fs)); RecordReader rows = reader.rows(); + batch = reader.getSchema().createRowBatch(); int idx = 0; - while (rows.hasNext()) { - Object row = rows.next(null); - assertEquals(input.get(idx++).longValue(), ((LongWritable) row).get()); + while (rows.nextBatch(batch)) { + for(int r=0; r < batch.size; ++r) { + assertEquals(input.get(idx++).longValue(), + ((LongColumnVector) batch.cols[0]).vector[r]); + } } } @Test public void testDeltaOverflow3() throws Exception { - ObjectInspector inspector; - synchronized (TestOrcFile.class) { - inspector = ObjectInspectorFactory - .getReflectionObjectInspector(Long.class, - ObjectInspectorFactory.ObjectInspectorOptions.JAVA); - } + TypeDescription schema = TypeDescription.createLong(); long[] inp = new long[]{-4513343538618202711l, -2911390882471569739l, -2, Long.MAX_VALUE}; @@ -415,161 +420,166 @@ public void testDeltaOverflow3() throws Exception { Writer writer = OrcFile.createWriter( testFilePath, - OrcFile.writerOptions(conf).inspector(inspector).stripeSize(100000) + OrcFile.writerOptions(conf).setSchema(schema).stripeSize(100000) .compress(CompressionKind.NONE).bufferSize(10000)); + VectorizedRowBatch batch = schema.createRowBatch(); for (Long l : input) { - writer.addRow(l); + appendLong(batch, l); } + writer.addRowBatch(batch); writer.close(); Reader reader = OrcFile .createReader(testFilePath, OrcFile.readerOptions(conf).filesystem(fs)); RecordReader rows = reader.rows(); + batch = reader.getSchema().createRowBatch(); int idx = 0; - while (rows.hasNext()) { - Object row = rows.next(null); - assertEquals(input.get(idx++).longValue(), ((LongWritable) row).get()); + while (rows.nextBatch(batch)) { + for(int r=0; r < batch.size; ++r) { + assertEquals(input.get(idx++).longValue(), + ((LongColumnVector) batch.cols[0]).vector[r]); + } } } @Test public void testIntegerMin() throws Exception { - ObjectInspector inspector; - synchronized (TestOrcFile.class) { - inspector = ObjectInspectorFactory.getReflectionObjectInspector( - Long.class, ObjectInspectorFactory.ObjectInspectorOptions.JAVA); - } + TypeDescription schema = TypeDescription.createLong(); List input = Lists.newArrayList(); input.add((long) Integer.MIN_VALUE); Writer writer = OrcFile.createWriter(testFilePath, - OrcFile.writerOptions(conf) - .inspector(inspector) - .stripeSize(100000) - .bufferSize(10000) - .encodingStrategy(encodingStrategy)); + OrcFile.writerOptions(conf) + .setSchema(schema) + .stripeSize(100000) + .bufferSize(10000) + .encodingStrategy(encodingStrategy)); + VectorizedRowBatch batch = schema.createRowBatch(); for(Long l : input) { - writer.addRow(l); + appendLong(batch, l); } + writer.addRowBatch(batch); writer.close(); Reader reader = OrcFile.createReader(testFilePath, OrcFile.readerOptions(conf).filesystem(fs)); RecordReader rows = reader.rows(); + batch = reader.getSchema().createRowBatch(); int idx = 0; - while (rows.hasNext()) { - Object row = rows.next(null); - assertEquals(input.get(idx++).longValue(), ((LongWritable) row).get()); + while (rows.nextBatch(batch)) { + for(int r=0; r 
< batch.size; ++r) { + assertEquals(input.get(idx++).longValue(), + ((LongColumnVector) batch.cols[0]).vector[r]); + } } } @Test public void testIntegerMax() throws Exception { - ObjectInspector inspector; - synchronized (TestOrcFile.class) { - inspector = ObjectInspectorFactory.getReflectionObjectInspector( - Long.class, ObjectInspectorFactory.ObjectInspectorOptions.JAVA); - } + TypeDescription schema = TypeDescription.createLong(); List input = Lists.newArrayList(); input.add((long) Integer.MAX_VALUE); Writer writer = OrcFile.createWriter(testFilePath, - OrcFile.writerOptions(conf) - .inspector(inspector) - .stripeSize(100000) - .compress(CompressionKind.NONE) - .bufferSize(10000) - .encodingStrategy(encodingStrategy)); + OrcFile.writerOptions(conf) + .setSchema(schema) + .stripeSize(100000) + .compress(CompressionKind.NONE) + .bufferSize(10000) + .encodingStrategy(encodingStrategy)); + VectorizedRowBatch batch = schema.createRowBatch(); for(Long l : input) { - writer.addRow(l); + appendLong(batch, l); } + writer.addRowBatch(batch); writer.close(); Reader reader = OrcFile.createReader(testFilePath, OrcFile.readerOptions(conf).filesystem(fs)); RecordReader rows = reader.rows(); + batch = reader.getSchema().createRowBatch(); int idx = 0; - while (rows.hasNext()) { - Object row = rows.next(null); - assertEquals(input.get(idx++).longValue(), ((LongWritable) row).get()); + while (rows.nextBatch(batch)) { + for(int r=0; r < batch.size; ++r) { + assertEquals(input.get(idx++).longValue(), + ((LongColumnVector) batch.cols[0]).vector[r]); + } } } @Test public void testLongMin() throws Exception { - ObjectInspector inspector; - synchronized (TestOrcFile.class) { - inspector = ObjectInspectorFactory.getReflectionObjectInspector( - Long.class, ObjectInspectorFactory.ObjectInspectorOptions.JAVA); - } + TypeDescription schema = TypeDescription.createLong(); List input = Lists.newArrayList(); input.add(Long.MIN_VALUE); Writer writer = OrcFile.createWriter(testFilePath, - OrcFile.writerOptions(conf) - .inspector(inspector) - .stripeSize(100000) - .compress(CompressionKind.NONE) - .bufferSize(10000) - .encodingStrategy(encodingStrategy)); + OrcFile.writerOptions(conf) + .setSchema(schema) + .stripeSize(100000) + .compress(CompressionKind.NONE) + .bufferSize(10000) + .encodingStrategy(encodingStrategy)); + VectorizedRowBatch batch = schema.createRowBatch(); for(Long l : input) { - writer.addRow(l); + appendLong(batch, l); } + writer.addRowBatch(batch); writer.close(); Reader reader = OrcFile.createReader(testFilePath, OrcFile.readerOptions(conf).filesystem(fs)); RecordReader rows = reader.rows(); + batch = reader.getSchema().createRowBatch(); int idx = 0; - while (rows.hasNext()) { - Object row = rows.next(null); - assertEquals(input.get(idx++).longValue(), ((LongWritable) row).get()); + while (rows.nextBatch(batch)) { + for(int r=0; r < batch.size; ++r) { + assertEquals(input.get(idx++).longValue(), + ((LongColumnVector) batch.cols[0]).vector[r]); + } } } @Test public void testLongMax() throws Exception { - ObjectInspector inspector; - synchronized (TestOrcFile.class) { - inspector = ObjectInspectorFactory.getReflectionObjectInspector( - Long.class, ObjectInspectorFactory.ObjectInspectorOptions.JAVA); - } + TypeDescription schema = TypeDescription.createLong(); List input = Lists.newArrayList(); input.add(Long.MAX_VALUE); Writer writer = OrcFile.createWriter(testFilePath, - OrcFile.writerOptions(conf) - .inspector(inspector) - .stripeSize(100000) - .compress(CompressionKind.NONE) - .bufferSize(10000) - 
.encodingStrategy(encodingStrategy)); + OrcFile.writerOptions(conf) + .setSchema(schema) + .stripeSize(100000) + .compress(CompressionKind.NONE) + .bufferSize(10000) + .encodingStrategy(encodingStrategy)); + VectorizedRowBatch batch = schema.createRowBatch(); for(Long l : input) { - writer.addRow(l); + appendLong(batch, l); } + writer.addRowBatch(batch); writer.close(); Reader reader = OrcFile.createReader(testFilePath, OrcFile.readerOptions(conf).filesystem(fs)); RecordReader rows = reader.rows(); + batch = reader.getSchema().createRowBatch(); int idx = 0; - while (rows.hasNext()) { - Object row = rows.next(null); - assertEquals(input.get(idx++).longValue(), ((LongWritable) row).get()); + while (rows.nextBatch(batch)) { + for(int r=0; r < batch.size; ++r) { + assertEquals(input.get(idx++).longValue(), + ((LongColumnVector) batch.cols[0]).vector[r]); + } } } @Test public void testRandomInt() throws Exception { - ObjectInspector inspector; - synchronized (TestOrcFile.class) { - inspector = ObjectInspectorFactory.getReflectionObjectInspector( - Long.class, ObjectInspectorFactory.ObjectInspectorOptions.JAVA); - } + TypeDescription schema = TypeDescription.createLong(); List input = Lists.newArrayList(); Random rand = new Random(); @@ -578,34 +588,35 @@ public void testRandomInt() throws Exception { } Writer writer = OrcFile.createWriter(testFilePath, - OrcFile.writerOptions(conf) - .inspector(inspector) - .stripeSize(100000) - .compress(CompressionKind.NONE) - .bufferSize(10000) - .encodingStrategy(encodingStrategy)); + OrcFile.writerOptions(conf) + .setSchema(schema) + .stripeSize(100000) + .compress(CompressionKind.NONE) + .bufferSize(10000) + .encodingStrategy(encodingStrategy)); + VectorizedRowBatch batch = schema.createRowBatch(100000); for(Long l : input) { - writer.addRow(l); + appendLong(batch, l); } + writer.addRowBatch(batch); writer.close(); Reader reader = OrcFile.createReader(testFilePath, OrcFile.readerOptions(conf).filesystem(fs)); RecordReader rows = reader.rows(); + batch = reader.getSchema().createRowBatch(); int idx = 0; - while (rows.hasNext()) { - Object row = rows.next(null); - assertEquals(input.get(idx++).longValue(), ((LongWritable) row).get()); + while (rows.nextBatch(batch)) { + for(int r=0; r < batch.size; ++r) { + assertEquals(input.get(idx++).longValue(), + ((LongColumnVector) batch.cols[0]).vector[r]); + } } } @Test public void testRandomLong() throws Exception { - ObjectInspector inspector; - synchronized (TestOrcFile.class) { - inspector = ObjectInspectorFactory.getReflectionObjectInspector( - Long.class, ObjectInspectorFactory.ObjectInspectorOptions.JAVA); - } + TypeDescription schema = TypeDescription.createLong(); List input = Lists.newArrayList(); Random rand = new Random(); @@ -614,34 +625,35 @@ public void testRandomLong() throws Exception { } Writer writer = OrcFile.createWriter(testFilePath, - OrcFile.writerOptions(conf) - .inspector(inspector) - .stripeSize(100000) - .compress(CompressionKind.NONE) - .bufferSize(10000) - .encodingStrategy(encodingStrategy)); + OrcFile.writerOptions(conf) + .setSchema(schema) + .stripeSize(100000) + .compress(CompressionKind.NONE) + .bufferSize(10000) + .encodingStrategy(encodingStrategy)); + VectorizedRowBatch batch = schema.createRowBatch(100000); for(Long l : input) { - writer.addRow(l); + appendLong(batch, l); } + writer.addRowBatch(batch); writer.close(); Reader reader = OrcFile.createReader(testFilePath, OrcFile.readerOptions(conf).filesystem(fs)); RecordReader rows = reader.rows(); + batch = 
reader.getSchema().createRowBatch(); int idx = 0; - while (rows.hasNext()) { - Object row = rows.next(null); - assertEquals(input.get(idx++).longValue(), ((LongWritable) row).get()); + while (rows.nextBatch(batch)) { + for(int r=0; r < batch.size; ++r) { + assertEquals(input.get(idx++).longValue(), + ((LongColumnVector) batch.cols[0]).vector[r]); + } } } @Test public void testPatchedBaseNegativeMin() throws Exception { - ObjectInspector inspector; - synchronized (TestOrcFile.class) { - inspector = ObjectInspectorFactory.getReflectionObjectInspector( - Long.class, ObjectInspectorFactory.ObjectInspectorOptions.JAVA); - } + TypeDescription schema = TypeDescription.createLong(); long[] inp = new long[] { 20, 2, 3, 2, 1, 3, 17, 71, 35, 2, 1, 139, 2, 2, 3, 1783, 475, 2, 1, 1, 3, 1, 3, 2, 32, 1, 2, 3, 1, 8, 30, 1, 3, 414, 1, @@ -658,34 +670,35 @@ public void testPatchedBaseNegativeMin() throws Exception { List input = Lists.newArrayList(Longs.asList(inp)); Writer writer = OrcFile.createWriter(testFilePath, - OrcFile.writerOptions(conf) - .inspector(inspector) - .stripeSize(100000) - .compress(CompressionKind.NONE) - .bufferSize(10000) - .encodingStrategy(encodingStrategy)); + OrcFile.writerOptions(conf) + .setSchema(schema) + .stripeSize(100000) + .compress(CompressionKind.NONE) + .bufferSize(10000) + .encodingStrategy(encodingStrategy)); + VectorizedRowBatch batch = schema.createRowBatch(); for(Long l : input) { - writer.addRow(l); + appendLong(batch, l); } + writer.addRowBatch(batch); writer.close(); Reader reader = OrcFile.createReader(testFilePath, OrcFile.readerOptions(conf).filesystem(fs)); RecordReader rows = reader.rows(); + batch = reader.getSchema().createRowBatch(); int idx = 0; - while (rows.hasNext()) { - Object row = rows.next(null); - assertEquals(input.get(idx++).longValue(), ((LongWritable) row).get()); + while (rows.nextBatch(batch)) { + for(int r=0; r < batch.size; ++r) { + assertEquals(input.get(idx++).longValue(), + ((LongColumnVector) batch.cols[0]).vector[r]); + } } } @Test public void testPatchedBaseNegativeMin2() throws Exception { - ObjectInspector inspector; - synchronized (TestOrcFile.class) { - inspector = ObjectInspectorFactory.getReflectionObjectInspector( - Long.class, ObjectInspectorFactory.ObjectInspectorOptions.JAVA); - } + TypeDescription schema = TypeDescription.createLong(); long[] inp = new long[] { 20, 2, 3, 2, 1, 3, 17, 71, 35, 2, 1, 139, 2, 2, 3, 1783, 475, 2, 1, 1, 3, 1, 3, 2, 32, 1, 2, 3, 1, 8, 30, 1, 3, 414, 1, @@ -702,34 +715,35 @@ public void testPatchedBaseNegativeMin2() throws Exception { List input = Lists.newArrayList(Longs.asList(inp)); Writer writer = OrcFile.createWriter(testFilePath, - OrcFile.writerOptions(conf) - .inspector(inspector) - .stripeSize(100000) - .compress(CompressionKind.NONE) - .bufferSize(10000) - .encodingStrategy(encodingStrategy)); + OrcFile.writerOptions(conf) + .setSchema(schema) + .stripeSize(100000) + .compress(CompressionKind.NONE) + .bufferSize(10000) + .encodingStrategy(encodingStrategy)); + VectorizedRowBatch batch = schema.createRowBatch(); for(Long l : input) { - writer.addRow(l); + appendLong(batch, l); } + writer.addRowBatch(batch); writer.close(); Reader reader = OrcFile.createReader(testFilePath, OrcFile.readerOptions(conf).filesystem(fs)); RecordReader rows = reader.rows(); + batch = reader.getSchema().createRowBatch(); int idx = 0; - while (rows.hasNext()) { - Object row = rows.next(null); - assertEquals(input.get(idx++).longValue(), ((LongWritable) row).get()); + while (rows.nextBatch(batch)) { + for(int 
r=0; r < batch.size; ++r) { + assertEquals(input.get(idx++).longValue(), + ((LongColumnVector) batch.cols[0]).vector[r]); + } } } @Test public void testPatchedBaseNegativeMin3() throws Exception { - ObjectInspector inspector; - synchronized (TestOrcFile.class) { - inspector = ObjectInspectorFactory.getReflectionObjectInspector( - Long.class, ObjectInspectorFactory.ObjectInspectorOptions.JAVA); - } + TypeDescription schema = TypeDescription.createLong(); long[] inp = new long[] { 20, 2, 3, 2, 1, 3, 17, 71, 35, 2, 1, 139, 2, 2, 3, 1783, 475, 2, 1, 1, 3, 1, 3, 2, 32, 1, 2, 3, 1, 8, 30, 1, 3, 414, 1, @@ -746,34 +760,35 @@ public void testPatchedBaseNegativeMin3() throws Exception { List input = Lists.newArrayList(Longs.asList(inp)); Writer writer = OrcFile.createWriter(testFilePath, - OrcFile.writerOptions(conf) - .inspector(inspector) - .stripeSize(100000) - .compress(CompressionKind.NONE) - .bufferSize(10000) - .encodingStrategy(encodingStrategy)); + OrcFile.writerOptions(conf) + .setSchema(schema) + .stripeSize(100000) + .compress(CompressionKind.NONE) + .bufferSize(10000) + .encodingStrategy(encodingStrategy)); + VectorizedRowBatch batch = schema.createRowBatch(); for(Long l : input) { - writer.addRow(l); + appendLong(batch, l); } + writer.addRowBatch(batch); writer.close(); Reader reader = OrcFile.createReader(testFilePath, OrcFile.readerOptions(conf).filesystem(fs)); RecordReader rows = reader.rows(); + batch = reader.getSchema().createRowBatch(); int idx = 0; - while (rows.hasNext()) { - Object row = rows.next(null); - assertEquals(input.get(idx++).longValue(), ((LongWritable) row).get()); + while (rows.nextBatch(batch)) { + for(int r=0; r < batch.size; ++r) { + assertEquals(input.get(idx++).longValue(), + ((LongColumnVector) batch.cols[0]).vector[r]); + } } } @Test public void testPatchedBaseNegativeMin4() throws Exception { - ObjectInspector inspector; - synchronized (TestOrcFile.class) { - inspector = ObjectInspectorFactory.getReflectionObjectInspector( - Long.class, ObjectInspectorFactory.ObjectInspectorOptions.JAVA); - } + TypeDescription schema = TypeDescription.createLong(); long[] inp = new long[] { 13, 13, 11, 8, 13, 10, 10, 11, 11, 14, 11, 7, 13, 12, 12, 11, 15, 12, 12, 9, 8, 10, 13, 11, 8, 6, 5, 6, 11, 7, 15, 10, 7, @@ -781,34 +796,35 @@ public void testPatchedBaseNegativeMin4() throws Exception { List input = Lists.newArrayList(Longs.asList(inp)); Writer writer = OrcFile.createWriter(testFilePath, - OrcFile.writerOptions(conf) - .inspector(inspector) - .stripeSize(100000) - .compress(CompressionKind.NONE) - .bufferSize(10000) - .encodingStrategy(encodingStrategy)); + OrcFile.writerOptions(conf) + .setSchema(schema) + .stripeSize(100000) + .compress(CompressionKind.NONE) + .bufferSize(10000) + .encodingStrategy(encodingStrategy)); + VectorizedRowBatch batch = schema.createRowBatch(); for(Long l : input) { - writer.addRow(l); + appendLong(batch, l); } + writer.addRowBatch(batch); writer.close(); Reader reader = OrcFile.createReader(testFilePath, OrcFile.readerOptions(conf).filesystem(fs)); RecordReader rows = reader.rows(); + batch = reader.getSchema().createRowBatch(); int idx = 0; - while (rows.hasNext()) { - Object row = rows.next(null); - assertEquals(input.get(idx++).longValue(), ((LongWritable) row).get()); + while (rows.nextBatch(batch)) { + for(int r=0; r < batch.size; ++r) { + assertEquals(input.get(idx++).longValue(), + ((LongColumnVector) batch.cols[0]).vector[r]); + } } } @Test public void testPatchedBaseAt0() throws Exception { - ObjectInspector inspector; - 
synchronized (TestOrcFile.class) { - inspector = ObjectInspectorFactory.getReflectionObjectInspector( - Long.class, ObjectInspectorFactory.ObjectInspectorOptions.JAVA); - } + TypeDescription schema = TypeDescription.createLong(); List input = Lists.newArrayList(); Random rand = new Random(); @@ -818,34 +834,35 @@ public void testPatchedBaseAt0() throws Exception { input.set(0, 20000L); Writer writer = OrcFile.createWriter(testFilePath, - OrcFile.writerOptions(conf) - .inspector(inspector) - .stripeSize(100000) - .compress(CompressionKind.NONE) - .bufferSize(10000) - .encodingStrategy(encodingStrategy)); + OrcFile.writerOptions(conf) + .setSchema(schema) + .stripeSize(100000) + .compress(CompressionKind.NONE) + .bufferSize(10000) + .encodingStrategy(encodingStrategy)); + VectorizedRowBatch batch = schema.createRowBatch(5120); for(Long l : input) { - writer.addRow(l); + appendLong(batch, l); } + writer.addRowBatch(batch); writer.close(); Reader reader = OrcFile.createReader(testFilePath, OrcFile.readerOptions(conf).filesystem(fs)); RecordReader rows = reader.rows(); + batch = reader.getSchema().createRowBatch(); int idx = 0; - while (rows.hasNext()) { - Object row = rows.next(null); - assertEquals(input.get(idx++).longValue(), ((LongWritable) row).get()); + while (rows.nextBatch(batch)) { + for(int r=0; r < batch.size; ++r) { + assertEquals(input.get(idx++).longValue(), + ((LongColumnVector) batch.cols[0]).vector[r]); + } } } @Test public void testPatchedBaseAt1() throws Exception { - ObjectInspector inspector; - synchronized (TestOrcFile.class) { - inspector = ObjectInspectorFactory.getReflectionObjectInspector( - Long.class, ObjectInspectorFactory.ObjectInspectorOptions.JAVA); - } + TypeDescription schema = TypeDescription.createLong(); List input = Lists.newArrayList(); Random rand = new Random(); @@ -855,34 +872,34 @@ public void testPatchedBaseAt1() throws Exception { input.set(1, 20000L); Writer writer = OrcFile.createWriter(testFilePath, - OrcFile.writerOptions(conf) - .inspector(inspector) - .stripeSize(100000) - .compress(CompressionKind.NONE) - .bufferSize(10000) - .encodingStrategy(encodingStrategy)); + OrcFile.writerOptions(conf) + .setSchema(schema) + .stripeSize(100000) + .compress(CompressionKind.NONE) + .bufferSize(10000) + .encodingStrategy(encodingStrategy)); + VectorizedRowBatch batch = schema.createRowBatch(5120); for(Long l : input) { - writer.addRow(l); + appendLong(batch, l); } + writer.addRowBatch(batch); writer.close(); Reader reader = OrcFile.createReader(testFilePath, OrcFile.readerOptions(conf).filesystem(fs)); RecordReader rows = reader.rows(); int idx = 0; - while (rows.hasNext()) { - Object row = rows.next(null); - assertEquals(input.get(idx++).longValue(), ((LongWritable) row).get()); + while (rows.nextBatch(batch)) { + for(int r=0; r < batch.size; ++r) { + assertEquals(input.get(idx++).longValue(), + ((LongColumnVector) batch.cols[0]).vector[r]); + } } } @Test public void testPatchedBaseAt255() throws Exception { - ObjectInspector inspector; - synchronized (TestOrcFile.class) { - inspector = ObjectInspectorFactory.getReflectionObjectInspector( - Long.class, ObjectInspectorFactory.ObjectInspectorOptions.JAVA); - } + TypeDescription schema = TypeDescription.createLong(); List input = Lists.newArrayList(); Random rand = new Random(); @@ -892,33 +909,34 @@ public void testPatchedBaseAt255() throws Exception { input.set(255, 20000L); Writer writer = OrcFile.createWriter(testFilePath, - OrcFile.writerOptions(conf) - .inspector(inspector) - .stripeSize(100000) - 
.bufferSize(10000) - .encodingStrategy(encodingStrategy)); + OrcFile.writerOptions(conf) + .setSchema(schema) + .stripeSize(100000) + .bufferSize(10000) + .encodingStrategy(encodingStrategy)); + VectorizedRowBatch batch = schema.createRowBatch(5120); for(Long l : input) { - writer.addRow(l); + appendLong(batch, l); } + writer.addRowBatch(batch); writer.close(); Reader reader = OrcFile.createReader(testFilePath, OrcFile.readerOptions(conf).filesystem(fs)); RecordReader rows = reader.rows(); + batch = reader.getSchema().createRowBatch(); int idx = 0; - while (rows.hasNext()) { - Object row = rows.next(null); - assertEquals(input.get(idx++).longValue(), ((LongWritable) row).get()); + while (rows.nextBatch(batch)) { + for(int r=0; r < batch.size; ++r) { + assertEquals(input.get(idx++).longValue(), + ((LongColumnVector) batch.cols[0]).vector[r]); + } } } @Test public void testPatchedBaseAt256() throws Exception { - ObjectInspector inspector; - synchronized (TestOrcFile.class) { - inspector = ObjectInspectorFactory.getReflectionObjectInspector( - Long.class, ObjectInspectorFactory.ObjectInspectorOptions.JAVA); - } + TypeDescription schema = TypeDescription.createLong(); List input = Lists.newArrayList(); Random rand = new Random(); @@ -928,33 +946,34 @@ public void testPatchedBaseAt256() throws Exception { input.set(256, 20000L); Writer writer = OrcFile.createWriter(testFilePath, - OrcFile.writerOptions(conf) - .inspector(inspector) - .stripeSize(100000) - .bufferSize(10000) - .encodingStrategy(encodingStrategy)); + OrcFile.writerOptions(conf) + .setSchema(schema) + .stripeSize(100000) + .bufferSize(10000) + .encodingStrategy(encodingStrategy)); + VectorizedRowBatch batch = schema.createRowBatch(5120); for(Long l : input) { - writer.addRow(l); + appendLong(batch, l); } + writer.addRowBatch(batch); writer.close(); Reader reader = OrcFile.createReader(testFilePath, OrcFile.readerOptions(conf).filesystem(fs)); RecordReader rows = reader.rows(); + batch = reader.getSchema().createRowBatch(); int idx = 0; - while (rows.hasNext()) { - Object row = rows.next(null); - assertEquals(input.get(idx++).longValue(), ((LongWritable) row).get()); + while (rows.nextBatch(batch)) { + for(int r=0; r < batch.size; ++r) { + assertEquals(input.get(idx++).longValue(), + ((LongColumnVector) batch.cols[0]).vector[r]); + } } } @Test public void testPatchedBase510() throws Exception { - ObjectInspector inspector; - synchronized (TestOrcFile.class) { - inspector = ObjectInspectorFactory.getReflectionObjectInspector( - Long.class, ObjectInspectorFactory.ObjectInspectorOptions.JAVA); - } + TypeDescription schema = TypeDescription.createLong(); List input = Lists.newArrayList(); Random rand = new Random(); @@ -964,33 +983,34 @@ public void testPatchedBase510() throws Exception { input.set(510, 20000L); Writer writer = OrcFile.createWriter(testFilePath, - OrcFile.writerOptions(conf) - .inspector(inspector) - .stripeSize(100000) - .bufferSize(10000) - .encodingStrategy(encodingStrategy)); + OrcFile.writerOptions(conf) + .setSchema(schema) + .stripeSize(100000) + .bufferSize(10000) + .encodingStrategy(encodingStrategy)); + VectorizedRowBatch batch = schema.createRowBatch(5120); for(Long l : input) { - writer.addRow(l); + appendLong(batch, l); } + writer.addRowBatch(batch); writer.close(); Reader reader = OrcFile.createReader(testFilePath, OrcFile.readerOptions(conf).filesystem(fs)); RecordReader rows = reader.rows(); + batch = reader.getSchema().createRowBatch(); int idx = 0; - while (rows.hasNext()) { - Object row = 
rows.next(null); - assertEquals(input.get(idx++).longValue(), ((LongWritable) row).get()); + while (rows.nextBatch(batch)) { + for(int r=0; r < batch.size; ++r) { + assertEquals(input.get(idx++).longValue(), + ((LongColumnVector) batch.cols[0]).vector[r]); + } } } @Test public void testPatchedBase511() throws Exception { - ObjectInspector inspector; - synchronized (TestOrcFile.class) { - inspector = ObjectInspectorFactory.getReflectionObjectInspector( - Long.class, ObjectInspectorFactory.ObjectInspectorOptions.JAVA); - } + TypeDescription schema = TypeDescription.createLong(); List input = Lists.newArrayList(); Random rand = new Random(); @@ -1000,33 +1020,34 @@ public void testPatchedBase511() throws Exception { input.set(511, 20000L); Writer writer = OrcFile.createWriter(testFilePath, - OrcFile.writerOptions(conf) - .inspector(inspector) - .stripeSize(100000) - .bufferSize(10000) - .encodingStrategy(encodingStrategy)); + OrcFile.writerOptions(conf) + .setSchema(schema) + .stripeSize(100000) + .bufferSize(10000) + .encodingStrategy(encodingStrategy)); + VectorizedRowBatch batch = schema.createRowBatch(5120); for(Long l : input) { - writer.addRow(l); + appendLong(batch, l); } + writer.addRowBatch(batch); writer.close(); Reader reader = OrcFile.createReader(testFilePath, OrcFile.readerOptions(conf).filesystem(fs)); RecordReader rows = reader.rows(); + batch = reader.getSchema().createRowBatch(); int idx = 0; - while (rows.hasNext()) { - Object row = rows.next(null); - assertEquals(input.get(idx++).longValue(), ((LongWritable) row).get()); + while (rows.nextBatch(batch)) { + for(int r=0; r < batch.size; ++r) { + assertEquals(input.get(idx++).longValue(), + ((LongColumnVector) batch.cols[0]).vector[r]); + } } } @Test public void testPatchedBaseMax1() throws Exception { - ObjectInspector inspector; - synchronized (TestOrcFile.class) { - inspector = ObjectInspectorFactory.getReflectionObjectInspector(Long.class, - ObjectInspectorFactory.ObjectInspectorOptions.JAVA); - } + TypeDescription schema = TypeDescription.createLong(); List input = Lists.newArrayList(); Random rand = new Random(); @@ -1037,32 +1058,33 @@ public void testPatchedBaseMax1() throws Exception { Writer writer = OrcFile.createWriter(testFilePath, OrcFile.writerOptions(conf) - .inspector(inspector) - .stripeSize(100000) - .bufferSize(10000) - .encodingStrategy(encodingStrategy)); + .setSchema(schema) + .stripeSize(100000) + .bufferSize(10000) + .encodingStrategy(encodingStrategy)); + VectorizedRowBatch batch = schema.createRowBatch(5120); for (Long l : input) { - writer.addRow(l); + appendLong(batch, l); } + writer.addRowBatch(batch); writer.close(); Reader reader = OrcFile.createReader(testFilePath, OrcFile.readerOptions(conf).filesystem(fs)); RecordReader rows = reader.rows(); + batch = reader.getSchema().createRowBatch(); int idx = 0; - while (rows.hasNext()) { - Object row = rows.next(null); - assertEquals(input.get(idx++).longValue(), ((LongWritable) row).get()); + while (rows.nextBatch(batch)) { + for(int r=0; r < batch.size; ++r) { + assertEquals(input.get(idx++).longValue(), + ((LongColumnVector) batch.cols[0]).vector[r]); + } } } @Test public void testPatchedBaseMax2() throws Exception { - ObjectInspector inspector; - synchronized (TestOrcFile.class) { - inspector = ObjectInspectorFactory.getReflectionObjectInspector(Long.class, - ObjectInspectorFactory.ObjectInspectorOptions.JAVA); - } + TypeDescription schema = TypeDescription.createLong(); List input = Lists.newArrayList(); Random rand = new Random(); @@ -1075,32 
+1097,33 @@ public void testPatchedBaseMax2() throws Exception { Writer writer = OrcFile.createWriter(testFilePath, OrcFile.writerOptions(conf) - .inspector(inspector) - .stripeSize(100000) - .bufferSize(10000) - .encodingStrategy(encodingStrategy)); + .setSchema(schema) + .stripeSize(100000) + .bufferSize(10000) + .encodingStrategy(encodingStrategy)); + VectorizedRowBatch batch = schema.createRowBatch(5120); for (Long l : input) { - writer.addRow(l); + appendLong(batch, l); } + writer.addRowBatch(batch); writer.close(); Reader reader = OrcFile.createReader(testFilePath, OrcFile.readerOptions(conf).filesystem(fs)); RecordReader rows = reader.rows(); + batch = reader.getSchema().createRowBatch(); int idx = 0; - while (rows.hasNext()) { - Object row = rows.next(null); - assertEquals(input.get(idx++).longValue(), ((LongWritable) row).get()); + while (rows.nextBatch(batch)) { + for(int r=0; r < batch.size; ++r) { + assertEquals(input.get(idx++).longValue(), + ((LongColumnVector) batch.cols[0]).vector[r]); + } } } @Test public void testPatchedBaseMax3() throws Exception { - ObjectInspector inspector; - synchronized (TestOrcFile.class) { - inspector = ObjectInspectorFactory.getReflectionObjectInspector(Long.class, - ObjectInspectorFactory.ObjectInspectorOptions.JAVA); - } + TypeDescription schema = TypeDescription.createLong(); List input = Lists.newArrayList(); input.add(371946367L); @@ -1126,32 +1149,32 @@ public void testPatchedBaseMax3() throws Exception { Writer writer = OrcFile.createWriter(testFilePath, OrcFile.writerOptions(conf) - .inspector(inspector) - .stripeSize(100000) - .bufferSize(10000) - .encodingStrategy(encodingStrategy)); + .setSchema(schema) + .stripeSize(100000) + .bufferSize(10000) + .encodingStrategy(encodingStrategy)); + VectorizedRowBatch batch = schema.createRowBatch(); for (Long l : input) { - writer.addRow(l); + appendLong(batch, l); } + writer.addRowBatch(batch); writer.close(); Reader reader = OrcFile.createReader(testFilePath, OrcFile.readerOptions(conf).filesystem(fs)); RecordReader rows = reader.rows(); int idx = 0; - while (rows.hasNext()) { - Object row = rows.next(null); - assertEquals(input.get(idx++).longValue(), ((LongWritable) row).get()); + while (rows.nextBatch(batch)) { + for(int r=0; r < batch.size; ++r) { + assertEquals(input.get(idx++).longValue(), + ((LongColumnVector) batch.cols[0]).vector[r]); + } } } @Test public void testPatchedBaseMax4() throws Exception { - ObjectInspector inspector; - synchronized (TestOrcFile.class) { - inspector = ObjectInspectorFactory.getReflectionObjectInspector(Long.class, - ObjectInspectorFactory.ObjectInspectorOptions.JAVA); - } + TypeDescription schema = TypeDescription.createLong(); List input = Lists.newArrayList(); for (int i = 0; i < 25; i++) { @@ -1180,39 +1203,42 @@ public void testPatchedBaseMax4() throws Exception { Writer writer = OrcFile.createWriter(testFilePath, OrcFile.writerOptions(conf) - .inspector(inspector) - .stripeSize(100000) - .bufferSize(10000) - .encodingStrategy(encodingStrategy)); + .setSchema(schema) + .stripeSize(100000) + .bufferSize(10000) + .encodingStrategy(encodingStrategy)); + VectorizedRowBatch batch = schema.createRowBatch(); for (Long l : input) { - writer.addRow(l); + appendLong(batch, l); } + writer.addRowBatch(batch); writer.close(); Reader reader = OrcFile.createReader(testFilePath, OrcFile.readerOptions(conf).filesystem(fs)); RecordReader rows = reader.rows(); + batch = reader.getSchema().createRowBatch(); int idx = 0; - while (rows.hasNext()) { - Object row = 
rows.next(null); - assertEquals(input.get(idx++).longValue(), ((LongWritable) row).get()); + while (rows.nextBatch(batch)) { + for(int r=0; r < batch.size; ++r) { + assertEquals(input.get(idx++).longValue(), + ((LongColumnVector) batch.cols[0]).vector[r]); + } } } @Test public void testPatchedBaseTimestamp() throws Exception { - ObjectInspector inspector; - synchronized (TestOrcFile.class) { - inspector = ObjectInspectorFactory.getReflectionObjectInspector(TSRow.class, - ObjectInspectorFactory.ObjectInspectorOptions.JAVA); - } + TypeDescription schema = TypeDescription.createStruct() + .addField("ts", TypeDescription.createTimestamp()); Writer writer = OrcFile.createWriter(testFilePath, OrcFile.writerOptions(conf) - .inspector(inspector) - .stripeSize(100000) - .bufferSize(10000) - .encodingStrategy(encodingStrategy)); + .setSchema(schema) + .stripeSize(100000) + .bufferSize(10000) + .encodingStrategy(encodingStrategy)); + VectorizedRowBatch batch = schema.createRowBatch(); List tslist = Lists.newArrayList(); tslist.add(Timestamp.valueOf("2099-01-01 00:00:00")); @@ -1248,68 +1274,68 @@ public void testPatchedBaseTimestamp() throws Exception { tslist.add(Timestamp.valueOf("2002-01-01 00:00:00")); tslist.add(Timestamp.valueOf("2005-01-01 00:00:00")); tslist.add(Timestamp.valueOf("1974-01-01 00:00:00")); - + int idx = 0; for (Timestamp ts : tslist) { - writer.addRow(new TSRow(ts)); + ((TimestampColumnVector) batch.cols[0]).set(idx, ts); } - + writer.addRowBatch(batch); writer.close(); Reader reader = OrcFile.createReader(testFilePath, OrcFile.readerOptions(conf).filesystem(fs)); RecordReader rows = reader.rows(); - int idx = 0; - while (rows.hasNext()) { - Object row = rows.next(null); - assertEquals(tslist.get(idx++).getNanos(), - ((TimestampWritable) ((OrcStruct) row).getFieldValue(0)).getNanos()); + batch = reader.getSchema().createRowBatch(); + idx = 0; + while (rows.nextBatch(batch)) { + for(int r=0; r < batch.size; ++r) { + assertEquals(tslist.get(idx++), + ((TimestampColumnVector) batch.cols[0]).asScratchTimestamp(r)); + } } } @Test public void testDirectLargeNegatives() throws Exception { - ObjectInspector inspector; - synchronized (TestOrcFile.class) { - inspector = ObjectInspectorFactory.getReflectionObjectInspector(Long.class, - ObjectInspectorFactory.ObjectInspectorOptions.JAVA); - } + TypeDescription schema = TypeDescription.createLong(); Writer writer = OrcFile.createWriter(testFilePath, OrcFile.writerOptions(conf) - .inspector(inspector) - .stripeSize(100000) - .bufferSize(10000) - .encodingStrategy(encodingStrategy)); - - writer.addRow(-7486502418706614742L); - writer.addRow(0L); - writer.addRow(1L); - writer.addRow(1L); - writer.addRow(-5535739865598783616L); + .setSchema(schema) + .stripeSize(100000) + .bufferSize(10000) + .encodingStrategy(encodingStrategy)); + VectorizedRowBatch batch = schema.createRowBatch(); + + appendLong(batch, -7486502418706614742L); + appendLong(batch, 0L); + appendLong(batch, 1L); + appendLong(batch, 1L); + appendLong(batch, -5535739865598783616L); + writer.addRowBatch(batch); writer.close(); Reader reader = OrcFile.createReader(testFilePath, OrcFile.readerOptions(conf).filesystem(fs)); RecordReader rows = reader.rows(); - Object row = rows.next(null); - assertEquals(-7486502418706614742L, ((LongWritable) row).get()); - row = rows.next(row); - assertEquals(0L, ((LongWritable) row).get()); - row = rows.next(row); - assertEquals(1L, ((LongWritable) row).get()); - row = rows.next(row); - assertEquals(1L, ((LongWritable) row).get()); - row = 
rows.next(row); - assertEquals(-5535739865598783616L, ((LongWritable) row).get()); + batch = reader.getSchema().createRowBatch(); + assertEquals(true, rows.nextBatch(batch)); + assertEquals(5, batch.size); + assertEquals(-7486502418706614742L, + ((LongColumnVector) batch.cols[0]).vector[0]); + assertEquals(0L, + ((LongColumnVector) batch.cols[0]).vector[1]); + assertEquals(1L, + ((LongColumnVector) batch.cols[0]).vector[2]); + assertEquals(1L, + ((LongColumnVector) batch.cols[0]).vector[3]); + assertEquals(-5535739865598783616L, + ((LongColumnVector) batch.cols[0]).vector[4]); + assertEquals(false, rows.nextBatch(batch)); } @Test public void testSeek() throws Exception { - ObjectInspector inspector; - synchronized (TestOrcFile.class) { - inspector = ObjectInspectorFactory.getReflectionObjectInspector( - Long.class, ObjectInspectorFactory.ObjectInspectorOptions.JAVA); - } + TypeDescription schema = TypeDescription.createLong(); List input = Lists.newArrayList(); Random rand = new Random(); @@ -1317,26 +1343,31 @@ public void testSeek() throws Exception { input.add((long) rand.nextInt()); } Writer writer = OrcFile.createWriter(testFilePath, - OrcFile.writerOptions(conf) - .inspector(inspector) - .compress(CompressionKind.NONE) - .stripeSize(100000) - .bufferSize(10000) - .version(OrcFile.Version.V_0_11) - .encodingStrategy(encodingStrategy)); + OrcFile.writerOptions(conf) + .setSchema(schema) + .compress(CompressionKind.NONE) + .stripeSize(100000) + .bufferSize(10000) + .version(OrcFile.Version.V_0_11) + .encodingStrategy(encodingStrategy)); + VectorizedRowBatch batch = schema.createRowBatch(100000); for(Long l : input) { - writer.addRow(l); + appendLong(batch, l); } + writer.addRowBatch(batch); writer.close(); Reader reader = OrcFile.createReader(testFilePath, OrcFile.readerOptions(conf).filesystem(fs)); RecordReader rows = reader.rows(); + batch = reader.getSchema().createRowBatch(); int idx = 55555; rows.seekToRow(idx); - while (rows.hasNext()) { - Object row = rows.next(null); - assertEquals(input.get(idx++).longValue(), ((LongWritable) row).get()); + while (rows.nextBatch(batch)) { + for(int r=0; r < batch.size; ++r) { + assertEquals(input.get(idx++).longValue(), + ((LongColumnVector) batch.cols[0]).vector[r]); + } } } } diff --git a/ql/src/test/org/apache/hadoop/hive/ql/io/orc/TestOrcNullOptimization.java b/orc/src/test/org/apache/orc/TestOrcNullOptimization.java similarity index 52% rename from ql/src/test/org/apache/hadoop/hive/ql/io/orc/TestOrcNullOptimization.java rename to orc/src/test/org/apache/orc/TestOrcNullOptimization.java index e96c80976c13..0b605c9fdc1d 100644 --- a/ql/src/test/org/apache/hadoop/hive/ql/io/orc/TestOrcNullOptimization.java +++ b/orc/src/test/org/apache/orc/TestOrcNullOptimization.java @@ -15,32 +15,26 @@ * See the License for the specific language governing permissions and * limitations under the License. 
*/ -package org.apache.hadoop.hive.ql.io.orc; +package org.apache.orc; import static junit.framework.Assert.assertEquals; -import static junit.framework.Assert.assertNotNull; -import static org.junit.Assert.assertNull; import java.io.File; -import java.util.ArrayList; +import java.io.IOException; import java.util.List; import java.util.Random; +import junit.framework.Assert; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.Path; -import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector; -import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorFactory; -import org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector; -import org.apache.hadoop.io.BooleanWritable; -import org.apache.hadoop.io.IntWritable; -import org.apache.orc.ColumnStatistics; -import org.apache.orc.CompressionKind; -import org.apache.orc.IntegerColumnStatistics; -import org.apache.orc.OrcProto; - -import org.apache.orc.StringColumnStatistics; -import org.apache.orc.StripeInformation; +import org.apache.hadoop.hive.ql.exec.vector.BytesColumnVector; +import org.apache.hadoop.hive.ql.exec.vector.ListColumnVector; +import org.apache.hadoop.hive.ql.exec.vector.LongColumnVector; +import org.apache.hadoop.hive.ql.exec.vector.StructColumnVector; +import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch; + +import org.apache.orc.impl.RecordReaderImpl; import org.junit.Before; import org.junit.Rule; import org.junit.Test; @@ -50,25 +44,58 @@ public class TestOrcNullOptimization { - public static class MyStruct { - Integer a; - String b; - Boolean c; - List list = new ArrayList(); - - public MyStruct(Integer a, String b, Boolean c, List l) { - this.a = a; - this.b = b; - this.c = c; - this.list = l; - } + TypeDescription createMyStruct() { + return TypeDescription.createStruct() + .addField("a", TypeDescription.createInt()) + .addField("b", TypeDescription.createString()) + .addField("c", TypeDescription.createBoolean()) + .addField("d", TypeDescription.createList( + TypeDescription.createStruct() + .addField("z", TypeDescription.createInt()))); } - public static class InnerStruct { - Integer z; - - public InnerStruct(int z) { - this.z = z; + void addRow(Writer writer, VectorizedRowBatch batch, + Integer a, String b, Boolean c, + Integer... d) throws IOException { + if (batch.size == batch.getMaxSize()) { + writer.addRowBatch(batch); + batch.reset(); + } + int row = batch.size++; + LongColumnVector aColumn = (LongColumnVector) batch.cols[0]; + BytesColumnVector bColumn = (BytesColumnVector) batch.cols[1]; + LongColumnVector cColumn = (LongColumnVector) batch.cols[2]; + ListColumnVector dColumn = (ListColumnVector) batch.cols[3]; + StructColumnVector dStruct = (StructColumnVector) dColumn.child; + LongColumnVector dInt = (LongColumnVector) dStruct.fields[0]; + if (a == null) { + aColumn.noNulls = false; + aColumn.isNull[row] = true; + } else { + aColumn.vector[row] = a; + } + if (b == null) { + bColumn.noNulls = false; + bColumn.isNull[row] = true; + } else { + bColumn.setVal(row, b.getBytes()); + } + if (c == null) { + cColumn.noNulls = false; + cColumn.isNull[row] = true; + } else { + cColumn.vector[row] = c ? 
1 : 0; + } + if (d == null) { + dColumn.noNulls = false; + dColumn.isNull[row] = true; + } else { + dColumn.offsets[row] = dColumn.childCount; + dColumn.lengths[row] = d.length; + dColumn.childCount += d.length; + for(int e=0; e < d.length; ++e) { + dInt.vector[(int) dColumn.offsets[row] + e] = d[e]; + } } } @@ -93,26 +120,21 @@ public void openFileSystem() throws Exception { @Test public void testMultiStripeWithNull() throws Exception { - ObjectInspector inspector; - synchronized (TestOrcNullOptimization.class) { - inspector = ObjectInspectorFactory.getReflectionObjectInspector - (MyStruct.class, ObjectInspectorFactory.ObjectInspectorOptions.JAVA); - } + TypeDescription schema = createMyStruct(); Writer writer = OrcFile.createWriter(testFilePath, OrcFile.writerOptions(conf) - .inspector(inspector) + .setSchema(schema) .stripeSize(100000) .compress(CompressionKind.NONE) .bufferSize(10000)); Random rand = new Random(100); - writer.addRow(new MyStruct(null, null, true, - Lists.newArrayList(new InnerStruct(100)))); + VectorizedRowBatch batch = schema.createRowBatch(); + addRow(writer, batch, null, null, true, 100); for (int i = 2; i < 20000; i++) { - writer.addRow(new MyStruct(rand.nextInt(1), "a", true, Lists - .newArrayList(new InnerStruct(100)))); + addRow(writer, batch, rand.nextInt(1), "a", true, 100); } - writer.addRow(new MyStruct(null, null, true, - Lists.newArrayList(new InnerStruct(100)))); + addRow(writer, batch, null, null, true, 100); + writer.addRowBatch(batch); writer.close(); Reader reader = OrcFile.createReader(testFilePath, @@ -136,12 +158,8 @@ public void testMultiStripeWithNull() throws Exception { stats[2].toString()); // check the inspectors - StructObjectInspector readerInspector = - (StructObjectInspector) reader.getObjectInspector(); - assertEquals(ObjectInspector.Category.STRUCT, - readerInspector.getCategory()); - assertEquals("struct>>", - readerInspector.getTypeName()); + assertEquals("struct>>", + reader.getSchema().toString()); RecordReader rows = reader.rows(); @@ -163,60 +181,64 @@ public void testMultiStripeWithNull() throws Exception { } assertEquals(expected, got); + batch = reader.getSchema().createRowBatch(); + LongColumnVector aColumn = (LongColumnVector) batch.cols[0]; + BytesColumnVector bColumn = (BytesColumnVector) batch.cols[1]; + LongColumnVector cColumn = (LongColumnVector) batch.cols[2]; + ListColumnVector dColumn = (ListColumnVector) batch.cols[3]; + LongColumnVector dElements = + (LongColumnVector)(((StructColumnVector) dColumn.child).fields[0]); + assertEquals(true , rows.nextBatch(batch)); + assertEquals(1024, batch.size); + // row 1 - OrcStruct row = (OrcStruct) rows.next(null); - assertNotNull(row); - assertNull(row.getFieldValue(0)); - assertNull(row.getFieldValue(1)); - assertEquals(new BooleanWritable(true), row.getFieldValue(2)); - assertEquals(new IntWritable(100), - ((OrcStruct) ((ArrayList) row.getFieldValue(3)).get(0)). 
- getFieldValue(0)); + assertEquals(true, aColumn.isNull[0]); + assertEquals(true, bColumn.isNull[0]); + assertEquals(1, cColumn.vector[0]); + assertEquals(0, dColumn.offsets[0]); + assertEquals(1, dColumn.lengths[1]); + assertEquals(100, dElements.vector[0]); rows.seekToRow(19998); + rows.nextBatch(batch); + assertEquals(2, batch.size); + // last-1 row - row = (OrcStruct) rows.next(null); - assertNotNull(row); - assertNotNull(row.getFieldValue(1)); - assertEquals(new IntWritable(0), row.getFieldValue(0)); - assertEquals(new BooleanWritable(true), row.getFieldValue(2)); - assertEquals(new IntWritable(100), - ((OrcStruct) ((ArrayList) row.getFieldValue(3)).get(0)). - getFieldValue(0)); + assertEquals(0, aColumn.vector[0]); + assertEquals("a", bColumn.toString(0)); + assertEquals(1, cColumn.vector[0]); + assertEquals(0, dColumn.offsets[0]); + assertEquals(1, dColumn.lengths[0]); + assertEquals(100, dElements.vector[0]); // last row - row = (OrcStruct) rows.next(row); - assertNotNull(row); - assertNull(row.getFieldValue(0)); - assertNull(row.getFieldValue(1)); - assertEquals(new BooleanWritable(true), row.getFieldValue(2)); - assertEquals(new IntWritable(100), - ((OrcStruct) ((ArrayList) row.getFieldValue(3)).get(0)). - getFieldValue(0)); - + assertEquals(true, aColumn.isNull[1]); + assertEquals(true, bColumn.isNull[1]); + assertEquals(1, cColumn.vector[1]); + assertEquals(1, dColumn.offsets[1]); + assertEquals(1, dColumn.lengths[1]); + assertEquals(100, dElements.vector[1]); + + assertEquals(false, rows.nextBatch(batch)); rows.close(); } @Test public void testMultiStripeWithoutNull() throws Exception { - ObjectInspector inspector; - synchronized (TestOrcNullOptimization.class) { - inspector = ObjectInspectorFactory.getReflectionObjectInspector - (MyStruct.class, ObjectInspectorFactory.ObjectInspectorOptions.JAVA); - } + TypeDescription schema = createMyStruct(); Writer writer = OrcFile.createWriter(testFilePath, OrcFile.writerOptions(conf) - .inspector(inspector) + .setSchema(schema) .stripeSize(100000) .compress(CompressionKind.NONE) .bufferSize(10000)); Random rand = new Random(100); + VectorizedRowBatch batch = schema.createRowBatch(); for (int i = 1; i < 20000; i++) { - writer.addRow(new MyStruct(rand.nextInt(1), "a", true, Lists - .newArrayList(new InnerStruct(100)))); + addRow(writer, batch, rand.nextInt(1), "a", true, 100); } - writer.addRow(new MyStruct(0, "b", true, - Lists.newArrayList(new InnerStruct(100)))); + addRow(writer, batch, 0, "b", true, 100); + writer.addRowBatch(batch); writer.close(); Reader reader = OrcFile.createReader(testFilePath, @@ -240,12 +262,8 @@ public void testMultiStripeWithoutNull() throws Exception { stats[2].toString()); // check the inspectors - StructObjectInspector readerInspector = - (StructObjectInspector) reader.getObjectInspector(); - assertEquals(ObjectInspector.Category.STRUCT, - readerInspector.getCategory()); - assertEquals("struct>>", - readerInspector.getTypeName()); + Assert.assertEquals("struct>>", + reader.getSchema().toString()); RecordReader rows = reader.rows(); @@ -266,58 +284,54 @@ public void testMultiStripeWithoutNull() throws Exception { assertEquals(expected, got); rows.seekToRow(19998); + + batch = reader.getSchema().createRowBatch(); + LongColumnVector aColumn = (LongColumnVector) batch.cols[0]; + BytesColumnVector bColumn = (BytesColumnVector) batch.cols[1]; + LongColumnVector cColumn = (LongColumnVector) batch.cols[2]; + ListColumnVector dColumn = (ListColumnVector) batch.cols[3]; + LongColumnVector dElements = + 
(LongColumnVector)(((StructColumnVector) dColumn.child).fields[0]); + + assertEquals(true, rows.nextBatch(batch)); + assertEquals(2, batch.size); + // last-1 row - OrcStruct row = (OrcStruct) rows.next(null); - assertNotNull(row); - assertNotNull(row.getFieldValue(1)); - assertEquals(new IntWritable(0), row.getFieldValue(0)); - assertEquals("a", row.getFieldValue(1).toString()); - assertEquals(new BooleanWritable(true), row.getFieldValue(2)); - assertEquals(new IntWritable(100), - ((OrcStruct) ((ArrayList) row.getFieldValue(3)).get(0)). - getFieldValue(0)); + assertEquals(0, aColumn.vector[0]); + assertEquals("a", bColumn.toString(0)); + assertEquals(1, cColumn.vector[0]); + assertEquals(0, dColumn.offsets[0]); + assertEquals(1, dColumn.lengths[0]); + assertEquals(100, dElements.vector[0]); // last row - row = (OrcStruct) rows.next(row); - assertNotNull(row); - assertNotNull(row.getFieldValue(0)); - assertNotNull(row.getFieldValue(1)); - assertEquals("b", row.getFieldValue(1).toString()); - assertEquals(new BooleanWritable(true), row.getFieldValue(2)); - assertEquals(new IntWritable(100), - ((OrcStruct) ((ArrayList) row.getFieldValue(3)).get(0)). - getFieldValue(0)); + assertEquals(0, aColumn.vector[1]); + assertEquals("b", bColumn.toString(1)); + assertEquals(1, cColumn.vector[1]); + assertEquals(1, dColumn.offsets[1]); + assertEquals(1, dColumn.lengths[1]); + assertEquals(100, dElements.vector[1]); rows.close(); } @Test public void testColumnsWithNullAndCompression() throws Exception { - ObjectInspector inspector; - synchronized (TestOrcNullOptimization.class) { - inspector = ObjectInspectorFactory.getReflectionObjectInspector - (MyStruct.class, ObjectInspectorFactory.ObjectInspectorOptions.JAVA); - } + TypeDescription schema = createMyStruct(); Writer writer = OrcFile.createWriter(testFilePath, OrcFile.writerOptions(conf) - .inspector(inspector) + .setSchema(schema) .stripeSize(100000) .bufferSize(10000)); - writer.addRow(new MyStruct(3, "a", true, - Lists.newArrayList(new InnerStruct(100)))); - writer.addRow(new MyStruct(null, "b", true, - Lists.newArrayList(new InnerStruct(100)))); - writer.addRow(new MyStruct(3, null, false, - Lists.newArrayList(new InnerStruct(100)))); - writer.addRow(new MyStruct(3, "d", true, - Lists.newArrayList(new InnerStruct(100)))); - writer.addRow(new MyStruct(2, "e", true, - Lists.newArrayList(new InnerStruct(100)))); - writer.addRow(new MyStruct(2, "f", true, - Lists.newArrayList(new InnerStruct(100)))); - writer.addRow(new MyStruct(2, "g", true, - Lists.newArrayList(new InnerStruct(100)))); - writer.addRow(new MyStruct(2, "h", true, - Lists.newArrayList(new InnerStruct(100)))); + VectorizedRowBatch batch = schema.createRowBatch(); + addRow(writer, batch, 3, "a", true, 100); + addRow(writer, batch, null, "b", true, 100); + addRow(writer, batch, 3, null, false, 100); + addRow(writer, batch, 3, "d", true, 100); + addRow(writer, batch, 2, "e", true, 100); + addRow(writer, batch, 2, "f", true, 100); + addRow(writer, batch, 2, "g", true, 100); + addRow(writer, batch, 2, "h", true, 100); + writer.addRowBatch(batch); writer.close(); Reader reader = OrcFile.createReader(testFilePath, @@ -341,12 +355,15 @@ public void testColumnsWithNullAndCompression() throws Exception { stats[2].toString()); // check the inspectors - StructObjectInspector readerInspector = - (StructObjectInspector) reader.getObjectInspector(); - assertEquals(ObjectInspector.Category.STRUCT, - readerInspector.getCategory()); - assertEquals("struct>>", - readerInspector.getTypeName()); + batch = 
reader.getSchema().createRowBatch(); + LongColumnVector aColumn = (LongColumnVector) batch.cols[0]; + BytesColumnVector bColumn = (BytesColumnVector) batch.cols[1]; + LongColumnVector cColumn = (LongColumnVector) batch.cols[2]; + ListColumnVector dColumn = (ListColumnVector) batch.cols[3]; + LongColumnVector dElements = + (LongColumnVector)(((StructColumnVector) dColumn.child).fields[0]); + Assert.assertEquals("struct>>", + reader.getSchema().toString()); RecordReader rows = reader.rows(); // only the last strip will have PRESENT stream @@ -366,35 +383,33 @@ public void testColumnsWithNullAndCompression() throws Exception { } assertEquals(expected, got); + assertEquals(true, rows.nextBatch(batch)); + assertEquals(8, batch.size); + // row 1 - OrcStruct row = (OrcStruct) rows.next(null); - assertNotNull(row); - assertEquals(new IntWritable(3), row.getFieldValue(0)); - assertEquals("a", row.getFieldValue(1).toString()); - assertEquals(new BooleanWritable(true), row.getFieldValue(2)); - assertEquals(new IntWritable(100), - ((OrcStruct) ((ArrayList) row.getFieldValue(3)).get(0)). - getFieldValue(0)); + assertEquals(3, aColumn.vector[0]); + assertEquals("a", bColumn.toString(0)); + assertEquals(1, cColumn.vector[0]); + assertEquals(0, dColumn.offsets[0]); + assertEquals(1, dColumn.lengths[0]); + assertEquals(100, dElements.vector[0]); // row 2 - row = (OrcStruct) rows.next(row); - assertNotNull(row); - assertNull(row.getFieldValue(0)); - assertEquals("b", row.getFieldValue(1).toString()); - assertEquals(new BooleanWritable(true), row.getFieldValue(2)); - assertEquals(new IntWritable(100), - ((OrcStruct) ((ArrayList) row.getFieldValue(3)).get(0)). - getFieldValue(0)); + assertEquals(true, aColumn.isNull[1]); + assertEquals("b", bColumn.toString(1)); + assertEquals(1, cColumn.vector[1]); + assertEquals(1, dColumn.offsets[1]); + assertEquals(1, dColumn.lengths[1]); + assertEquals(100, dElements.vector[1]); // row 3 - row = (OrcStruct) rows.next(row); - assertNotNull(row); - assertNull(row.getFieldValue(1)); - assertEquals(new IntWritable(3), row.getFieldValue(0)); - assertEquals(new BooleanWritable(false), row.getFieldValue(2)); - assertEquals(new IntWritable(100), - ((OrcStruct) ((ArrayList) row.getFieldValue(3)).get(0)). - getFieldValue(0)); + assertEquals(3, aColumn.vector[2]); + assertEquals(true, bColumn.isNull[2]); + assertEquals(0, cColumn.vector[2]); + assertEquals(2, dColumn.offsets[2]); + assertEquals(1, dColumn.lengths[2]); + assertEquals(100, dElements.vector[2]); + rows.close(); } } diff --git a/ql/src/test/org/apache/hadoop/hive/ql/io/orc/TestOrcTimezone1.java b/orc/src/test/org/apache/orc/TestOrcTimezone1.java similarity index 73% rename from ql/src/test/org/apache/hadoop/hive/ql/io/orc/TestOrcTimezone1.java rename to orc/src/test/org/apache/orc/TestOrcTimezone1.java index 526c3572f808..72dc455069ef 100644 --- a/ql/src/test/org/apache/hadoop/hive/ql/io/orc/TestOrcTimezone1.java +++ b/orc/src/test/org/apache/orc/TestOrcTimezone1.java @@ -15,7 +15,7 @@ * See the License for the specific language governing permissions and * limitations under the License. 
*/ -package org.apache.hadoop.hive.ql.io.orc; +package org.apache.orc; import static junit.framework.Assert.assertEquals; import static junit.framework.Assert.assertNotNull; @@ -27,16 +27,12 @@ import java.util.List; import java.util.TimeZone; +import junit.framework.Assert; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.Path; -import org.apache.hadoop.hive.serde2.io.TimestampWritable; -import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector; -import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorFactory; -import org.apache.hadoop.hive.serde2.objectinspector.StructField; -import org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector; -import org.apache.hadoop.hive.serde2.objectinspector.primitive.TimestampObjectInspector; -import org.apache.hive.common.util.HiveTestUtils; +import org.apache.hadoop.hive.ql.exec.vector.TimestampColumnVector; +import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch; import org.junit.After; import org.junit.Before; import org.junit.Rule; @@ -114,15 +110,12 @@ public void restoreTimeZone() { @Test public void testTimestampWriter() throws Exception { - ObjectInspector inspector; - synchronized (TestOrcFile.class) { - inspector = ObjectInspectorFactory.getReflectionObjectInspector(Timestamp.class, - ObjectInspectorFactory.ObjectInspectorOptions.JAVA); - } + TypeDescription schema = TypeDescription.createTimestamp(); TimeZone.setDefault(TimeZone.getTimeZone(writerTimeZone)); Writer writer = OrcFile.createWriter(testFilePath, - OrcFile.writerOptions(conf).inspector(inspector).stripeSize(100000).bufferSize(10000)); + OrcFile.writerOptions(conf).setSchema(schema).stripeSize(100000) + .bufferSize(10000)); assertEquals(writerTimeZone, TimeZone.getDefault().getID()); List ts = Lists.newArrayList(); ts.add("2003-01-01 01:00:00.000000222"); @@ -138,21 +131,26 @@ public void testTimestampWriter() throws Exception { ts.add("2008-10-02 11:00:00.0"); ts.add("2037-01-01 00:00:00.000999"); ts.add("2014-03-28 00:00:00.0"); + VectorizedRowBatch batch = schema.createRowBatch(); + TimestampColumnVector times = (TimestampColumnVector) batch.cols[0]; for (String t : ts) { - writer.addRow(Timestamp.valueOf(t)); + times.set(batch.size++, Timestamp.valueOf(t)); } + writer.addRowBatch(batch); writer.close(); TimeZone.setDefault(TimeZone.getTimeZone(readerTimeZone)); Reader reader = OrcFile.createReader(testFilePath, OrcFile.readerOptions(conf).filesystem(fs)); assertEquals(readerTimeZone, TimeZone.getDefault().getID()); - RecordReader rows = reader.rows(null); + RecordReader rows = reader.rows(); + batch = reader.getSchema().createRowBatch(); + times = (TimestampColumnVector) batch.cols[0]; int idx = 0; - while (rows.hasNext()) { - Object row = rows.next(null); - Timestamp got = ((TimestampWritable) row).getTimestamp(); - assertEquals(ts.get(idx++), got.toString()); + while (rows.nextBatch(batch)) { + for(int r=0; r < batch.size; ++r) { + assertEquals(ts.get(idx++), times.asScratchTimestamp(r).toString()); + } } rows.close(); } @@ -160,35 +158,32 @@ public void testTimestampWriter() throws Exception { @Test public void testReadTimestampFormat_0_11() throws Exception { TimeZone.setDefault(TimeZone.getTimeZone(readerTimeZone)); - Path oldFilePath = - new Path(HiveTestUtils.getFileFromClasspath("orc-file-11-format.orc")); + Path oldFilePath = new Path(getClass().getClassLoader(). 
+ getSystemResource("orc-file-11-format.orc").getPath()); Reader reader = OrcFile.createReader(oldFilePath, OrcFile.readerOptions(conf).filesystem(fs)); - - StructObjectInspector readerInspector = (StructObjectInspector) reader - .getObjectInspector(); - List fields = readerInspector - .getAllStructFieldRefs(); - TimestampObjectInspector tso = (TimestampObjectInspector) readerInspector - .getStructFieldRef("ts").getFieldObjectInspector(); - - RecordReader rows = reader.rows(); - Object row = rows.next(null); - assertNotNull(row); + TypeDescription schema = reader.getSchema(); + int col = schema.getFieldNames().indexOf("ts"); + VectorizedRowBatch batch = schema.createRowBatch(10); + TimestampColumnVector ts = (TimestampColumnVector) batch.cols[col]; + + boolean[] include = new boolean[schema.getMaximumId() + 1]; + include[schema.getChildren().get(col).getId()] = true; + RecordReader rows = reader.rows + (new Reader.Options().include(include)); + assertEquals(true, rows.nextBatch(batch)); assertEquals(Timestamp.valueOf("2000-03-12 15:00:00"), - tso.getPrimitiveJavaObject(readerInspector.getStructFieldData(row, - fields.get(12)))); - + ts.asScratchTimestamp(0)); + // check the contents of second row - assertEquals(true, rows.hasNext()); rows.seekToRow(7499); - row = rows.next(null); + assertEquals(true, rows.nextBatch(batch)); + assertEquals(1, batch.size); assertEquals(Timestamp.valueOf("2000-03-12 15:00:01"), - tso.getPrimitiveJavaObject(readerInspector.getStructFieldData(row, - fields.get(12)))); - + ts.asScratchTimestamp(0)); + // handle the close up - assertEquals(false, rows.hasNext()); + Assert.assertEquals(false, rows.nextBatch(batch)); rows.close(); } } diff --git a/ql/src/test/org/apache/hadoop/hive/ql/io/orc/TestOrcTimezone2.java b/orc/src/test/org/apache/orc/TestOrcTimezone2.java similarity index 82% rename from ql/src/test/org/apache/hadoop/hive/ql/io/orc/TestOrcTimezone2.java rename to orc/src/test/org/apache/orc/TestOrcTimezone2.java index 3eae4a95511f..4a0285521adb 100644 --- a/ql/src/test/org/apache/hadoop/hive/ql/io/orc/TestOrcTimezone2.java +++ b/orc/src/test/org/apache/orc/TestOrcTimezone2.java @@ -15,7 +15,7 @@ * See the License for the specific language governing permissions and * limitations under the License. 
*/ -package org.apache.hadoop.hive.ql.io.orc; +package org.apache.orc; import static junit.framework.Assert.assertEquals; @@ -30,9 +30,8 @@ import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.Path; -import org.apache.hadoop.hive.serde2.io.TimestampWritable; -import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector; -import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorFactory; +import org.apache.hadoop.hive.ql.exec.vector.TimestampColumnVector; +import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch; import org.junit.After; import org.junit.Before; import org.junit.Rule; @@ -98,15 +97,12 @@ public void restoreTimeZone() { @Test public void testTimestampWriter() throws Exception { - ObjectInspector inspector; - synchronized (TestOrcFile.class) { - inspector = ObjectInspectorFactory.getReflectionObjectInspector(Timestamp.class, - ObjectInspectorFactory.ObjectInspectorOptions.JAVA); - } + TypeDescription schema = TypeDescription.createTimestamp(); TimeZone.setDefault(TimeZone.getTimeZone(writerTimeZone)); Writer writer = OrcFile.createWriter(testFilePath, - OrcFile.writerOptions(conf).inspector(inspector).stripeSize(100000).bufferSize(10000)); + OrcFile.writerOptions(conf).setSchema(schema) + .stripeSize(100000).bufferSize(10000)); assertEquals(writerTimeZone, TimeZone.getDefault().getID()); List ts = Lists.newArrayList(); ts.add("2003-01-01 01:00:00.000000222"); @@ -121,21 +117,26 @@ public void testTimestampWriter() throws Exception { ts.add("1998-11-02 10:00:00.857340643"); ts.add("2008-10-02 11:00:00.0"); ts.add("2037-01-01 00:00:00.000999"); + VectorizedRowBatch batch = schema.createRowBatch(); + TimestampColumnVector tsc = (TimestampColumnVector) batch.cols[0]; for (String t : ts) { - writer.addRow(Timestamp.valueOf(t)); + tsc.set(batch.size++, Timestamp.valueOf(t)); } + writer.addRowBatch(batch); writer.close(); TimeZone.setDefault(TimeZone.getTimeZone(readerTimeZone)); Reader reader = OrcFile.createReader(testFilePath, OrcFile.readerOptions(conf).filesystem(fs)); assertEquals(readerTimeZone, TimeZone.getDefault().getID()); - RecordReader rows = reader.rows(null); + RecordReader rows = reader.rows(); int idx = 0; - while (rows.hasNext()) { - Object row = rows.next(null); - Timestamp got = ((TimestampWritable) row).getTimestamp(); - assertEquals(ts.get(idx++), got.toString()); + batch = reader.getSchema().createRowBatch(); + tsc = (TimestampColumnVector) batch.cols[0]; + while (rows.nextBatch(batch)) { + for (int r=0; r < batch.size; ++r) { + assertEquals(ts.get(idx++), tsc.asScratchTimestamp(r).toString()); + } } rows.close(); } diff --git a/ql/src/test/org/apache/hadoop/hive/ql/io/orc/TestStringDictionary.java b/orc/src/test/org/apache/orc/TestStringDictionary.java similarity index 61% rename from ql/src/test/org/apache/hadoop/hive/ql/io/orc/TestStringDictionary.java rename to orc/src/test/org/apache/orc/TestStringDictionary.java index 41a211bb5853..46209bbbb3d5 100644 --- a/ql/src/test/org/apache/hadoop/hive/ql/io/orc/TestStringDictionary.java +++ b/orc/src/test/org/apache/orc/TestStringDictionary.java @@ -15,7 +15,7 @@ * See the License for the specific language governing permissions and * limitations under the License. 
*/ -package org.apache.hadoop.hive.ql.io.orc; +package org.apache.orc; import static org.junit.Assert.assertEquals; @@ -25,14 +25,10 @@ import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.Path; -import org.apache.hadoop.hive.conf.HiveConf.ConfVars; -import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector; -import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorFactory; -import org.apache.hadoop.io.Text; -import org.apache.orc.CompressionKind; -import org.apache.orc.OrcProto; - -import org.apache.orc.StripeInformation; +import org.apache.hadoop.hive.ql.exec.vector.BytesColumnVector; +import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch; + +import org.apache.orc.impl.RecordReaderImpl; import org.junit.Before; import org.junit.Rule; import org.junit.Test; @@ -60,27 +56,34 @@ public void openFileSystem() throws Exception { @Test public void testTooManyDistinct() throws Exception { - ObjectInspector inspector; - synchronized (TestOrcFile.class) { - inspector = ObjectInspectorFactory.getReflectionObjectInspector(Text.class, - ObjectInspectorFactory.ObjectInspectorOptions.JAVA); - } + TypeDescription schema = TypeDescription.createString(); Writer writer = OrcFile.createWriter( testFilePath, - OrcFile.writerOptions(conf).inspector(inspector).compress(CompressionKind.NONE) - .bufferSize(10000)); + OrcFile.writerOptions(conf).setSchema(schema) + .compress(CompressionKind.NONE) + .bufferSize(10000)); + VectorizedRowBatch batch = schema.createRowBatch(); + BytesColumnVector col = (BytesColumnVector) batch.cols[0]; for (int i = 0; i < 20000; i++) { - writer.addRow(new Text(String.valueOf(i))); + if (batch.size == batch.getMaxSize()) { + writer.addRowBatch(batch); + batch.reset(); + } + col.setVal(batch.size++, String.valueOf(i).getBytes()); } + writer.addRowBatch(batch); writer.close(); Reader reader = OrcFile.createReader(testFilePath, OrcFile.readerOptions(conf).filesystem(fs)); RecordReader rows = reader.rows(); + batch = reader.getSchema().createRowBatch(); + col = (BytesColumnVector) batch.cols[0]; int idx = 0; - while (rows.hasNext()) { - Object row = rows.next(null); - assertEquals(new Text(String.valueOf(idx++)), row); + while (rows.nextBatch(batch)) { + for(int r=0; r < batch.size; ++r) { + assertEquals(String.valueOf(idx++), col.toString(r)); + } } // make sure the encoding type is correct @@ -97,15 +100,11 @@ public void testTooManyDistinct() throws Exception { @Test public void testHalfDistinct() throws Exception { - ObjectInspector inspector; - synchronized (TestOrcFile.class) { - inspector = ObjectInspectorFactory.getReflectionObjectInspector(Text.class, - ObjectInspectorFactory.ObjectInspectorOptions.JAVA); - } + TypeDescription schema = TypeDescription.createString(); Writer writer = OrcFile.createWriter( testFilePath, - OrcFile.writerOptions(conf).inspector(inspector).compress(CompressionKind.NONE) + OrcFile.writerOptions(conf).setSchema(schema).compress(CompressionKind.NONE) .bufferSize(10000)); Random rand = new Random(123); int[] input = new int[20000]; @@ -113,17 +112,27 @@ public void testHalfDistinct() throws Exception { input[i] = rand.nextInt(10000); } + VectorizedRowBatch batch = schema.createRowBatch(); + BytesColumnVector col = (BytesColumnVector) batch.cols[0]; for (int i = 0; i < 20000; i++) { - writer.addRow(new Text(String.valueOf(input[i]))); + if (batch.size == batch.getMaxSize()) { + writer.addRowBatch(batch); + batch.reset(); + } + col.setVal(batch.size++, 
String.valueOf(input[i]).getBytes()); } + writer.addRowBatch(batch); writer.close(); Reader reader = OrcFile.createReader(testFilePath, OrcFile.readerOptions(conf).filesystem(fs)); RecordReader rows = reader.rows(); + batch = reader.getSchema().createRowBatch(); + col = (BytesColumnVector) batch.cols[0]; int idx = 0; - while (rows.hasNext()) { - Object row = rows.next(null); - assertEquals(new Text(String.valueOf(input[idx++])), row); + while (rows.nextBatch(batch)) { + for(int r=0; r < batch.size; ++r) { + assertEquals(String.valueOf(input[idx++]), col.toString(r)); + } } // make sure the encoding type is correct @@ -140,28 +149,34 @@ public void testHalfDistinct() throws Exception { @Test public void testTooManyDistinctCheckDisabled() throws Exception { - ObjectInspector inspector; - synchronized (TestOrcFile.class) { - inspector = ObjectInspectorFactory.getReflectionObjectInspector(Text.class, - ObjectInspectorFactory.ObjectInspectorOptions.JAVA); - } + TypeDescription schema = TypeDescription.createString(); - conf.setBoolean(ConfVars.HIVE_ORC_ROW_INDEX_STRIDE_DICTIONARY_CHECK.varname, false); + conf.setBoolean(OrcConf.ROW_INDEX_STRIDE_DICTIONARY_CHECK.getAttribute(), false); Writer writer = OrcFile.createWriter( testFilePath, - OrcFile.writerOptions(conf).inspector(inspector).compress(CompressionKind.NONE) + OrcFile.writerOptions(conf).setSchema(schema).compress(CompressionKind.NONE) .bufferSize(10000)); + VectorizedRowBatch batch = schema.createRowBatch(); + BytesColumnVector string = (BytesColumnVector) batch.cols[0]; for (int i = 0; i < 20000; i++) { - writer.addRow(new Text(String.valueOf(i))); + if (batch.size == batch.getMaxSize()) { + writer.addRowBatch(batch); + batch.reset(); + } + string.setVal(batch.size++, String.valueOf(i).getBytes()); } + writer.addRowBatch(batch); writer.close(); Reader reader = OrcFile.createReader(testFilePath, OrcFile.readerOptions(conf).filesystem(fs)); RecordReader rows = reader.rows(); + batch = reader.getSchema().createRowBatch(); + string = (BytesColumnVector) batch.cols[0]; int idx = 0; - while (rows.hasNext()) { - Object row = rows.next(null); - assertEquals(new Text(String.valueOf(idx++)), row); + while (rows.nextBatch(batch)) { + for(int r=0; r < batch.size; ++r) { + assertEquals(String.valueOf(idx++), string.toString(r)); + } } // make sure the encoding type is correct @@ -178,34 +193,41 @@ public void testTooManyDistinctCheckDisabled() throws Exception { @Test public void testHalfDistinctCheckDisabled() throws Exception { - ObjectInspector inspector; - synchronized (TestOrcFile.class) { - inspector = ObjectInspectorFactory.getReflectionObjectInspector(Text.class, - ObjectInspectorFactory.ObjectInspectorOptions.JAVA); - } + TypeDescription schema = TypeDescription.createString(); - conf.setBoolean(ConfVars.HIVE_ORC_ROW_INDEX_STRIDE_DICTIONARY_CHECK.varname, false); + conf.setBoolean(OrcConf.ROW_INDEX_STRIDE_DICTIONARY_CHECK.getAttribute(), + false); Writer writer = OrcFile.createWriter( testFilePath, - OrcFile.writerOptions(conf).inspector(inspector).compress(CompressionKind.NONE) + OrcFile.writerOptions(conf).setSchema(schema) + .compress(CompressionKind.NONE) .bufferSize(10000)); Random rand = new Random(123); int[] input = new int[20000]; for (int i = 0; i < 20000; i++) { input[i] = rand.nextInt(10000); } - + VectorizedRowBatch batch = schema.createRowBatch(); + BytesColumnVector string = (BytesColumnVector) batch.cols[0]; for (int i = 0; i < 20000; i++) { - writer.addRow(new Text(String.valueOf(input[i]))); + if (batch.size == 
batch.getMaxSize()) { + writer.addRowBatch(batch); + batch.reset(); + } + string.setVal(batch.size++, String.valueOf(input[i]).getBytes()); } + writer.addRowBatch(batch); writer.close(); Reader reader = OrcFile.createReader(testFilePath, OrcFile.readerOptions(conf).filesystem(fs)); RecordReader rows = reader.rows(); + batch = reader.getSchema().createRowBatch(); + string = (BytesColumnVector) batch.cols[0]; int idx = 0; - while (rows.hasNext()) { - Object row = rows.next(null); - assertEquals(new Text(String.valueOf(input[idx++])), row); + while (rows.nextBatch(batch)) { + for(int r=0; r < batch.size; ++r) { + assertEquals(String.valueOf(input[idx++]), string.toString(r)); + } } // make sure the encoding type is correct @@ -222,27 +244,34 @@ public void testHalfDistinctCheckDisabled() throws Exception { @Test public void testTooManyDistinctV11AlwaysDictionary() throws Exception { - ObjectInspector inspector; - synchronized (TestOrcFile.class) { - inspector = ObjectInspectorFactory.getReflectionObjectInspector(Text.class, - ObjectInspectorFactory.ObjectInspectorOptions.JAVA); - } + TypeDescription schema = TypeDescription.createString(); Writer writer = OrcFile.createWriter( testFilePath, - OrcFile.writerOptions(conf).inspector(inspector).compress(CompressionKind.NONE) + OrcFile.writerOptions(conf).setSchema(schema) + .compress(CompressionKind.NONE) .version(OrcFile.Version.V_0_11).bufferSize(10000)); + VectorizedRowBatch batch = schema.createRowBatch(); + BytesColumnVector string = (BytesColumnVector) batch.cols[0]; for (int i = 0; i < 20000; i++) { - writer.addRow(new Text(String.valueOf(i))); + if (batch.size == batch.getMaxSize()) { + writer.addRowBatch(batch); + batch.reset(); + } + string.setVal(batch.size++, String.valueOf(i).getBytes()); } + writer.addRowBatch(batch); writer.close(); Reader reader = OrcFile.createReader(testFilePath, OrcFile.readerOptions(conf).filesystem(fs)); + batch = reader.getSchema().createRowBatch(); + string = (BytesColumnVector) batch.cols[0]; RecordReader rows = reader.rows(); int idx = 0; - while (rows.hasNext()) { - Object row = rows.next(null); - assertEquals(new Text(String.valueOf(idx++)), row); + while (rows.nextBatch(batch)) { + for(int r=0; r < batch.size; ++r) { + assertEquals(String.valueOf(idx++), string.toString(r)); + } } // make sure the encoding type is correct diff --git a/ql/src/test/org/apache/hadoop/hive/ql/io/orc/TestTypeDescription.java b/orc/src/test/org/apache/orc/TestTypeDescription.java similarity index 98% rename from ql/src/test/org/apache/hadoop/hive/ql/io/orc/TestTypeDescription.java rename to orc/src/test/org/apache/orc/TestTypeDescription.java index 96af65adaf94..0ac1e64ef285 100644 --- a/ql/src/test/org/apache/hadoop/hive/ql/io/orc/TestTypeDescription.java +++ b/orc/src/test/org/apache/orc/TestTypeDescription.java @@ -15,7 +15,7 @@ * See the License for the specific language governing permissions and * limitations under the License. 
*/ -package org.apache.hadoop.hive.ql.io.orc; +package org.apache.orc; import static org.junit.Assert.assertEquals; diff --git a/ql/src/test/org/apache/hadoop/hive/ql/io/orc/TestUnrolledBitPack.java b/orc/src/test/org/apache/orc/TestUnrolledBitPack.java similarity index 81% rename from ql/src/test/org/apache/hadoop/hive/ql/io/orc/TestUnrolledBitPack.java rename to orc/src/test/org/apache/orc/TestUnrolledBitPack.java index 325173144c15..ef8fcd0c75fc 100644 --- a/ql/src/test/org/apache/hadoop/hive/ql/io/orc/TestUnrolledBitPack.java +++ b/orc/src/test/org/apache/orc/TestUnrolledBitPack.java @@ -16,7 +16,7 @@ * limitations under the License. */ -package org.apache.hadoop.hive.ql.io.orc; +package org.apache.orc; import static org.junit.Assert.assertEquals; @@ -28,10 +28,8 @@ import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.Path; -import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector; -import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorFactory; -import org.apache.hadoop.io.LongWritable; -import org.apache.orc.CompressionKind; +import org.apache.hadoop.hive.ql.exec.vector.LongColumnVector; +import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch; import org.junit.Before; import org.junit.Rule; import org.junit.Test; @@ -80,11 +78,7 @@ public void openFileSystem() throws Exception { @Test public void testBitPacking() throws Exception { - ObjectInspector inspector; - synchronized (TestOrcFile.class) { - inspector = ObjectInspectorFactory.getReflectionObjectInspector(Long.class, - ObjectInspectorFactory.ObjectInspectorOptions.JAVA); - } + TypeDescription schema = TypeDescription.createLong(); long[] inp = new long[] { val, 0, val, val, 0, val, 0, val, val, 0, val, 0, val, val, 0, 0, val, val, 0, val, 0, 0, val, 0, val, 0, val, 0, 0, val, 0, val, 0, val, 0, 0, val, 0, val, @@ -95,19 +89,25 @@ public void testBitPacking() throws Exception { Writer writer = OrcFile.createWriter( testFilePath, - OrcFile.writerOptions(conf).inspector(inspector).stripeSize(100000) + OrcFile.writerOptions(conf).setSchema(schema).stripeSize(100000) .compress(CompressionKind.NONE).bufferSize(10000)); + VectorizedRowBatch batch = schema.createRowBatch(); for (Long l : input) { - writer.addRow(l); + int row = batch.size++; + ((LongColumnVector) batch.cols[0]).vector[row] = l; } + writer.addRowBatch(batch); writer.close(); Reader reader = OrcFile.createReader(testFilePath, OrcFile.readerOptions(conf).filesystem(fs)); RecordReader rows = reader.rows(); + batch = reader.getSchema().createRowBatch(); int idx = 0; - while (rows.hasNext()) { - Object row = rows.next(null); - assertEquals(input.get(idx++).longValue(), ((LongWritable) row).get()); + while (rows.nextBatch(batch)) { + for(int r=0; r < batch.size; ++r) { + assertEquals(input.get(idx++).longValue(), + ((LongColumnVector) batch.cols[0]).vector[r]); + } } } diff --git a/ql/src/test/org/apache/hadoop/hive/ql/io/orc/TestVectorOrcFile.java b/orc/src/test/org/apache/orc/TestVectorOrcFile.java similarity index 95% rename from ql/src/test/org/apache/hadoop/hive/ql/io/orc/TestVectorOrcFile.java rename to orc/src/test/org/apache/orc/TestVectorOrcFile.java index 65896920e815..112edb91b792 100644 --- a/ql/src/test/org/apache/hadoop/hive/ql/io/orc/TestVectorOrcFile.java +++ b/orc/src/test/org/apache/orc/TestVectorOrcFile.java @@ -16,10 +16,11 @@ * limitations under the License. 
*/ -package org.apache.hadoop.hive.ql.io.orc; +package org.apache.orc; import com.google.common.collect.Lists; +import junit.framework.Assert; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.Path; @@ -41,25 +42,12 @@ import org.apache.hadoop.hive.serde2.io.HiveDecimalWritable; import org.apache.hadoop.io.BytesWritable; import org.apache.hadoop.io.Text; -import org.apache.hive.common.util.HiveTestUtils; -import org.apache.orc.BinaryColumnStatistics; -import org.apache.orc.BooleanColumnStatistics; -import org.apache.orc.ColumnStatistics; -import org.apache.orc.CompressionKind; -import org.apache.orc.DataReader; -import org.apache.orc.DecimalColumnStatistics; -import org.apache.orc.DoubleColumnStatistics; -import org.apache.orc.IntegerColumnStatistics; import org.apache.orc.impl.DataReaderProperties; import org.apache.orc.impl.MemoryManager; import org.apache.orc.impl.OrcIndex; -import org.apache.orc.OrcProto; -import org.apache.orc.OrcUtils; -import org.apache.orc.StringColumnStatistics; -import org.apache.orc.StripeInformation; -import org.apache.orc.StripeStatistics; -import org.apache.orc.TypeDescription; -import org.apache.orc.Writer; +import org.apache.orc.impl.RecordReaderImpl; +import org.apache.orc.impl.RecordReaderUtils; +import org.apache.orc.tools.TestJsonFileDump; import org.junit.Before; import org.junit.Rule; import org.junit.Test; @@ -79,10 +67,10 @@ import java.util.Map; import java.util.Random; -import static junit.framework.Assert.assertEquals; -import static junit.framework.Assert.assertNotNull; -import static junit.framework.Assert.assertNull; -import static junit.framework.Assert.assertTrue; +import static junit.framework.TestCase.assertNotNull; +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertNull; +import static org.junit.Assert.assertTrue; /** * Tests for the vectorized reader and writer for ORC files. 
@@ -181,7 +169,7 @@ public void openFileSystem () throws Exception { @Test public void testReadFormat_0_11() throws Exception { Path oldFilePath = - new Path(HiveTestUtils.getFileFromClasspath("orc-file-11-format.orc")); + new Path(TestJsonFileDump.getFileFromClasspath("orc-file-11-format.orc")); Reader reader = OrcFile.createReader(oldFilePath, OrcFile.readerOptions(conf).filesystem(fs)); @@ -200,7 +188,7 @@ public void testReadFormat_0_11() throws Exception { + stripe.getFooterLength(); } } - assertEquals(reader.getNumberOfRows(), rowCount); + Assert.assertEquals(reader.getNumberOfRows(), rowCount); assertEquals(2, stripeCount); // check the stats @@ -226,8 +214,8 @@ public void testReadFormat_0_11() throws Exception { "count: 7500 hasNull: true min: 9223372036854775807 max: 9223372036854775807", stats[5].toString()); - assertEquals(-15.0, ((DoubleColumnStatistics) stats[7]).getMinimum()); - assertEquals(-5.0, ((DoubleColumnStatistics) stats[7]).getMaximum()); + assertEquals(-15.0, ((DoubleColumnStatistics) stats[7]).getMinimum(), 0.0001); + assertEquals(-5.0, ((DoubleColumnStatistics) stats[7]).getMaximum(), 0.0001); assertEquals(-75000.0, ((DoubleColumnStatistics) stats[7]).getSum(), 0.00001); assertEquals("count: 7500 hasNull: true min: -15.0 max: -5.0 sum: -75000.0", @@ -247,7 +235,7 @@ public void testReadFormat_0_11() throws Exception { VectorizedRowBatch batch = schema.createRowBatch(); RecordReader rows = reader.rows(); - assertEquals(true, rows.nextBatch(batch)); + Assert.assertEquals(true, rows.nextBatch(batch)); assertEquals(1024, batch.size); // check the contents of the first row @@ -300,7 +288,7 @@ public void testReadFormat_0_11() throws Exception { // check the contents of row 7499 rows.seekToRow(7499); - assertEquals(true, rows.nextBatch(batch)); + Assert.assertEquals(true, rows.nextBatch(batch)); assertEquals(true, getBoolean(batch, 0)); assertEquals(100, getByte(batch, 0)); assertEquals(2048, getShort(batch, 0)); @@ -339,7 +327,7 @@ public void testReadFormat_0_11() throws Exception { decs.vector[0]); // handle the close up - assertEquals(false, rows.nextBatch(batch)); + Assert.assertEquals(false, rows.nextBatch(batch)); rows.close(); } @@ -387,7 +375,7 @@ public void testTimestamp() throws Exception { timestamps.asScratchTimestamp(r).getNanos()); } } - assertEquals(tslist.size(), rows.getRowNumber()); + Assert.assertEquals(tslist.size(), rows.getRowNumber()); assertEquals(0, writer.getSchema().getMaximumId()); boolean[] expected = new boolean[] {false}; boolean[] included = OrcUtils.includeColumns("", writer.getSchema()); @@ -460,7 +448,7 @@ public void testStringAndBinaryStatistics() throws Exception { BytesColumnVector bytes = (BytesColumnVector) batch.cols[0]; BytesColumnVector strs = (BytesColumnVector) batch.cols[1]; RecordReader rows = reader.rows(); - assertEquals(true, rows.nextBatch(batch)); + Assert.assertEquals(true, rows.nextBatch(batch)); assertEquals(4, batch.size); // check the contents of the first row @@ -480,7 +468,7 @@ public void testStringAndBinaryStatistics() throws Exception { assertEquals("hi", strs.toString(3)); // handle the close up - assertEquals(false, rows.hasNext()); + Assert.assertEquals(false, rows.nextBatch(batch)); rows.close(); } @@ -1012,8 +1000,8 @@ public void test1() throws Exception { assertEquals(1024, ((IntegerColumnStatistics) ss.getColumnStatistics()[3]).getMinimum()); assertEquals(2048, ((IntegerColumnStatistics) ss.getColumnStatistics()[3]).getMaximum()); assertEquals(3072, ((IntegerColumnStatistics) 
ss.getColumnStatistics()[3]).getSum()); - assertEquals(-15.0, ((DoubleColumnStatistics) stats[7]).getMinimum()); - assertEquals(-5.0, ((DoubleColumnStatistics) stats[7]).getMaximum()); + assertEquals(-15.0, ((DoubleColumnStatistics) stats[7]).getMinimum(), 0.0001); + assertEquals(-5.0, ((DoubleColumnStatistics) stats[7]).getMaximum(), 0.0001); assertEquals(-20.0, ((DoubleColumnStatistics) stats[7]).getSum(), 0.00001); assertEquals("count: 2 hasNull: false min: -15.0 max: -5.0 sum: -20.0", stats[7].toString()); @@ -1062,9 +1050,9 @@ public void test1() throws Exception { RecordReader rows = reader.rows(); // create a new batch batch = readerSchema.createRowBatch(); - assertEquals(true, rows.nextBatch(batch)); + Assert.assertEquals(true, rows.nextBatch(batch)); assertEquals(2, batch.size); - assertEquals(false, rows.hasNext()); + Assert.assertEquals(false, rows.nextBatch(batch)); // check the contents of the first row assertEquals(false, getBoolean(batch, 0)); @@ -1127,7 +1115,7 @@ public void test1() throws Exception { assertEquals("mauddib", value.string1.toString()); // handle the close up - assertEquals(false, rows.nextBatch(batch)); + Assert.assertEquals(false, rows.nextBatch(batch)); rows.close(); } @@ -1201,15 +1189,17 @@ public void testColumnProjection() throws Exception { assertEquals("struct", type.toString()); // read the contents and make sure they match - RecordReader rows1 = reader.rows(new boolean[]{true, true, false}); - RecordReader rows2 = reader.rows(new boolean[]{true, false, true}); + RecordReader rows1 = reader.rows( + new Reader.Options().include(new boolean[]{true, true, false})); + RecordReader rows2 = reader.rows( + new Reader.Options().include(new boolean[]{true, false, true})); r1 = new Random(1); r2 = new Random(2); VectorizedRowBatch batch1 = reader.getSchema().createRowBatch(1000); VectorizedRowBatch batch2 = reader.getSchema().createRowBatch(1000); for(int i = 0; i < 21000; i += 1000) { - assertEquals(true, rows1.nextBatch(batch1)); - assertEquals(true, rows2.nextBatch(batch2)); + Assert.assertEquals(true, rows1.nextBatch(batch1)); + Assert.assertEquals(true, rows2.nextBatch(batch2)); assertEquals(1000, batch1.size); assertEquals(1000, batch2.size); for(int j=0; j < 1000; ++j) { @@ -1219,8 +1209,8 @@ public void testColumnProjection() throws Exception { ((BytesColumnVector) batch2.cols[1]).toString(j)); } } - assertEquals(false, rows1.nextBatch(batch1)); - assertEquals(false, rows2.nextBatch(batch2)); + Assert.assertEquals(false, rows1.nextBatch(batch1)); + Assert.assertEquals(false, rows2.nextBatch(batch2)); rows1.close(); rows2.close(); } @@ -1237,13 +1227,14 @@ public void testEmptyFile() throws Exception { writer.close(); Reader reader = OrcFile.createReader(testFilePath, OrcFile.readerOptions(conf).filesystem(fs)); - assertEquals(false, reader.rows().hasNext()); - assertEquals(CompressionKind.NONE, reader.getCompressionKind()); - assertEquals(0, reader.getNumberOfRows()); - assertEquals(0, reader.getCompressionSize()); - assertEquals(false, reader.getMetadataKeys().iterator().hasNext()); - assertEquals(3, reader.getContentLength()); - assertEquals(false, reader.getStripes().iterator().hasNext()); + VectorizedRowBatch batch = reader.getSchema().createRowBatch(); + Assert.assertEquals(false, reader.rows().nextBatch(batch)); + Assert.assertEquals(CompressionKind.NONE, reader.getCompressionKind()); + Assert.assertEquals(0, reader.getNumberOfRows()); + Assert.assertEquals(0, reader.getCompressionSize()); + Assert.assertEquals(false, 
reader.getMetadataKeys().iterator().hasNext()); + Assert.assertEquals(3, reader.getContentLength()); + Assert.assertEquals(false, reader.getStripes().iterator().hasNext()); } @Test @@ -1275,10 +1266,10 @@ public void metaData() throws Exception { Reader reader = OrcFile.createReader(testFilePath, OrcFile.readerOptions(conf).filesystem(fs)); - assertEquals(byteBuf(5,7,11,13,17,19), reader.getMetadataValue("clobber")); - assertEquals(byteBuf(1,2,3,4,5,6,7,-1,-2,127,-128), + Assert.assertEquals(byteBuf(5, 7, 11, 13, 17, 19), reader.getMetadataValue("clobber")); + Assert.assertEquals(byteBuf(1, 2, 3, 4, 5, 6, 7, -1, -2, 127, -128), reader.getMetadataValue("my.meta")); - assertEquals(bigBuf, reader.getMetadataValue("big")); + Assert.assertEquals(bigBuf, reader.getMetadataValue("big")); try { reader.getMetadataValue("unknown"); assertTrue(false); @@ -1505,8 +1496,8 @@ public void testUnionAndTimestamp() throws Exception { included = OrcUtils.includeColumns("union", schema); assertEquals(true, Arrays.equals(expected, included)); - assertEquals(false, reader.getMetadataKeys().iterator().hasNext()); - assertEquals(5077, reader.getNumberOfRows()); + Assert.assertEquals(false, reader.getMetadataKeys().iterator().hasNext()); + Assert.assertEquals(5077, reader.getNumberOfRows()); DecimalColumnStatistics stats = (DecimalColumnStatistics) reader.getStatistics()[5]; assertEquals(71, stats.getNumberOfValues()); @@ -1527,19 +1518,19 @@ public void testUnionAndTimestamp() throws Exception { currentOffset += stripe.getLength(); } } - assertEquals(reader.getNumberOfRows(), rowCount); + Assert.assertEquals(reader.getNumberOfRows(), rowCount); assertEquals(2, stripeCount); - assertEquals(reader.getContentLength(), currentOffset); + Assert.assertEquals(reader.getContentLength(), currentOffset); RecordReader rows = reader.rows(); - assertEquals(0, rows.getRowNumber()); - assertEquals(0.0, rows.getProgress(), 0.000001); + Assert.assertEquals(0, rows.getRowNumber()); + Assert.assertEquals(0.0, rows.getProgress(), 0.000001); schema = reader.getSchema(); batch = schema.createRowBatch(74); - assertEquals(0, rows.getRowNumber()); + Assert.assertEquals(0, rows.getRowNumber()); rows.nextBatch(batch); assertEquals(74, batch.size); - assertEquals(74, rows.getRowNumber()); + Assert.assertEquals(74, rows.getRowNumber()); TimestampColumnVector ts = (TimestampColumnVector) batch.cols[0]; UnionColumnVector union = (UnionColumnVector) batch.cols[1]; LongColumnVector longs = (LongColumnVector) union.fields[0]; @@ -1633,8 +1624,8 @@ public void testUnionAndTimestamp() throws Exception { rows.nextBatch(batch); assertEquals(0, batch.size); - assertEquals(1.0, rows.getProgress(), 0.00001); - assertEquals(reader.getNumberOfRows(), rows.getRowNumber()); + Assert.assertEquals(1.0, rows.getProgress(), 0.00001); + Assert.assertEquals(reader.getNumberOfRows(), rows.getRowNumber()); rows.seekToRow(1); rows.nextBatch(batch); assertEquals(1000, batch.size); @@ -1672,7 +1663,7 @@ public void testSnappy() throws Exception { writer.close(); Reader reader = OrcFile.createReader(testFilePath, OrcFile.readerOptions(conf).filesystem(fs)); - assertEquals(CompressionKind.SNAPPY, reader.getCompressionKind()); + Assert.assertEquals(CompressionKind.SNAPPY, reader.getCompressionKind()); RecordReader rows = reader.rows(); batch = reader.getSchema().createRowBatch(1000); rand = new Random(12); @@ -1720,8 +1711,8 @@ public void testWithoutIndex() throws Exception { writer.close(); Reader reader = OrcFile.createReader(testFilePath, 
OrcFile.readerOptions(conf).filesystem(fs)); - assertEquals(50000, reader.getNumberOfRows()); - assertEquals(0, reader.getRowIndexStride()); + Assert.assertEquals(50000, reader.getNumberOfRows()); + Assert.assertEquals(0, reader.getRowIndexStride()); StripeInformation stripe = reader.getStripes().iterator().next(); assertEquals(true, stripe.getDataLength() != 0); assertEquals(0, stripe.getIndexLength()); @@ -1793,7 +1784,7 @@ public void testSeek() throws Exception { writer.close(); Reader reader = OrcFile.createReader(testFilePath, OrcFile.readerOptions(conf).filesystem(fs)); - assertEquals(COUNT, reader.getNumberOfRows()); + Assert.assertEquals(COUNT, reader.getNumberOfRows()); RecordReader rows = reader.rows(); // get the row index DataReader meta = RecordReaderUtils.createDefaultDataReader( @@ -1822,7 +1813,7 @@ public void testSeek() throws Exception { if (nextRowInBatch < 0) { long base = Math.max(i - 1023, 0); rows.seekToRow(base); - assertEquals("row " + i, true, rows.nextBatch(batch)); + Assert.assertEquals("row " + i, true, rows.nextBatch(batch)); nextRowInBatch = batch.size - 1; } checkRandomRow(batch, intValues, doubleValues, @@ -1848,18 +1839,18 @@ public void testSeek() throws Exception { boolean[] columns = new boolean[reader.getStatistics().length]; columns[5] = true; // long colulmn columns[9] = true; // text column - rows = reader.rowsOptions(new Reader.Options() + rows = reader.rows(new Reader.Options() .range(offsetOfStripe2, offsetOfStripe4 - offsetOfStripe2) .include(columns)); rows.seekToRow(lastRowOfStripe2); // we only want two rows batch = reader.getSchema().createRowBatch(2); - assertEquals(true, rows.nextBatch(batch)); + Assert.assertEquals(true, rows.nextBatch(batch)); assertEquals(1, batch.size); assertEquals(intValues[(int) lastRowOfStripe2], getLong(batch, 0)); assertEquals(stringValues[(int) lastRowOfStripe2], getText(batch, 0).toString()); - assertEquals(true, rows.nextBatch(batch)); + Assert.assertEquals(true, rows.nextBatch(batch)); assertEquals(intValues[(int) lastRowOfStripe2 + 1], getLong(batch, 0)); assertEquals(stringValues[(int) lastRowOfStripe2 + 1], getText(batch, 0).toString()); @@ -2048,7 +2039,7 @@ public void testPredicatePushdown() throws Exception { .lessThan("int1", PredicateLeaf.Type.LONG, 600000L) .end() .build(); - RecordReader rows = reader.rowsOptions(new Reader.Options() + RecordReader rows = reader.rows(new Reader.Options() .range(0L, Long.MAX_VALUE) .include(new boolean[]{true, true, true}) .searchArgument(sarg, new String[]{null, "int1", "string1"})); @@ -2056,16 +2047,16 @@ public void testPredicatePushdown() throws Exception { LongColumnVector ints = (LongColumnVector) batch.cols[0]; BytesColumnVector strs = (BytesColumnVector) batch.cols[1]; - assertEquals(1000L, rows.getRowNumber()); - assertEquals(true, rows.nextBatch(batch)); + Assert.assertEquals(1000L, rows.getRowNumber()); + Assert.assertEquals(true, rows.nextBatch(batch)); assertEquals(1000, batch.size); for(int i=1000; i < 2000; ++i) { assertEquals(300 * i, ints.vector[i - 1000]); assertEquals(Integer.toHexString(10*i), strs.toString(i - 1000)); } - assertEquals(false, rows.nextBatch(batch)); - assertEquals(3500, rows.getRowNumber()); + Assert.assertEquals(false, rows.nextBatch(batch)); + Assert.assertEquals(3500, rows.getRowNumber()); // look through the file with no rows selected sarg = SearchArgumentFactory.newBuilder() @@ -2073,12 +2064,12 @@ public void testPredicatePushdown() throws Exception { .lessThan("int1", PredicateLeaf.Type.LONG, 0L) .end() .build(); - rows 
= reader.rowsOptions(new Reader.Options() + rows = reader.rows(new Reader.Options() .range(0L, Long.MAX_VALUE) .include(new boolean[]{true, true, true}) .searchArgument(sarg, new String[]{null, "int1", "string1"})); - assertEquals(3500L, rows.getRowNumber()); - assertTrue(!rows.hasNext()); + Assert.assertEquals(3500L, rows.getRowNumber()); + assertTrue(!rows.nextBatch(batch)); // select first 100 and last 100 rows sarg = SearchArgumentFactory.newBuilder() @@ -2089,28 +2080,28 @@ public void testPredicatePushdown() throws Exception { .end() .end() .build(); - rows = reader.rowsOptions(new Reader.Options() + rows = reader.rows(new Reader.Options() .range(0L, Long.MAX_VALUE) .include(new boolean[]{true, true, true}) .searchArgument(sarg, new String[]{null, "int1", "string1"})); - assertEquals(0, rows.getRowNumber()); - assertEquals(true, rows.nextBatch(batch)); + Assert.assertEquals(0, rows.getRowNumber()); + Assert.assertEquals(true, rows.nextBatch(batch)); assertEquals(1000, batch.size); - assertEquals(3000, rows.getRowNumber()); + Assert.assertEquals(3000, rows.getRowNumber()); for(int i=0; i < 1000; ++i) { assertEquals(300 * i, ints.vector[i]); assertEquals(Integer.toHexString(10*i), strs.toString(i)); } - assertEquals(true, rows.nextBatch(batch)); + Assert.assertEquals(true, rows.nextBatch(batch)); assertEquals(500, batch.size); - assertEquals(3500, rows.getRowNumber()); + Assert.assertEquals(3500, rows.getRowNumber()); for(int i=3000; i < 3500; ++i) { assertEquals(300 * i, ints.vector[i - 3000]); assertEquals(Integer.toHexString(10*i), strs.toString(i - 3000)); } - assertEquals(false, rows.nextBatch(batch)); - assertEquals(3500, rows.getRowNumber()); + Assert.assertEquals(false, rows.nextBatch(batch)); + Assert.assertEquals(3500, rows.getRowNumber()); } /** @@ -2321,7 +2312,7 @@ public void testRepeating() throws Exception { BytesColumnVector mapKeys = (BytesColumnVector) maps.keys; BytesColumnVector mapValues = (BytesColumnVector) maps.values; - assertEquals(true, rows.nextBatch(batch)); + Assert.assertEquals(true, rows.nextBatch(batch)); assertEquals(1024, batch.size); // read the 1024 nulls @@ -2335,7 +2326,7 @@ public void testRepeating() throws Exception { } // read the 1024 repeat values - assertEquals(true, rows.nextBatch(batch)); + Assert.assertEquals(true, rows.nextBatch(batch)); assertEquals(1024, batch.size); for(int r=0; r < 1024; ++r) { assertEquals("row " + r, "Horton", bins.toString(r)); @@ -2368,7 +2359,7 @@ public void testRepeating() throws Exception { } // read the second set of 1024 nulls - assertEquals(true, rows.nextBatch(batch)); + Assert.assertEquals(true, rows.nextBatch(batch)); assertEquals(1024, batch.size); for(int f=0; f < batch.cols.length; ++f) { assertEquals("field " + f, @@ -2379,7 +2370,7 @@ public void testRepeating() throws Exception { true, batch.cols[f].isNull[0]); } - assertEquals(true, rows.nextBatch(batch)); + Assert.assertEquals(true, rows.nextBatch(batch)); assertEquals(1024, batch.size); for(int r=0; r < 1024; ++r) { String hex = Integer.toHexString(r); @@ -2416,7 +2407,7 @@ public void testRepeating() throws Exception { } // should have no more rows - assertEquals(false, rows.nextBatch(batch)); + Assert.assertEquals(false, rows.nextBatch(batch)); } private static String makeString(BytesColumnVector vector, int row) { @@ -2459,7 +2450,7 @@ public void testStringPadding() throws Exception { OrcFile.readerOptions(conf)); RecordReader rows = reader.rows(); batch = reader.getSchema().createRowBatch(); - assertEquals(true, rows.nextBatch(batch)); + 
Assert.assertEquals(true, rows.nextBatch(batch)); assertEquals(4, batch.size); // ORC currently trims the output strings. See HIVE-12286 assertEquals("", @@ -2509,19 +2500,19 @@ public void testNonDictionaryRepeatingString() throws Exception { OrcFile.readerOptions(conf)); RecordReader rows = reader.rows(); batch = reader.getSchema().createRowBatch(); - assertEquals(true, rows.nextBatch(batch)); + Assert.assertEquals(true, rows.nextBatch(batch)); assertEquals(1024, batch.size); for(int r=0; r < 1024; ++r) { assertEquals(Integer.toString(r * 10001), makeString((BytesColumnVector) batch.cols[0], r)); } - assertEquals(true, rows.nextBatch(batch)); + Assert.assertEquals(true, rows.nextBatch(batch)); assertEquals(1024, batch.size); for(int r=0; r < 1024; ++r) { assertEquals("Halloween", makeString((BytesColumnVector) batch.cols[0], r)); } - assertEquals(false, rows.nextBatch(batch)); + Assert.assertEquals(false, rows.nextBatch(batch)); } @Test @@ -2607,7 +2598,7 @@ public void testUnions() throws Exception { UnionColumnVector union = (UnionColumnVector) batch.cols[0]; LongColumnVector ints = (LongColumnVector) union.fields[0]; LongColumnVector longs = (LongColumnVector) union.fields[1]; - assertEquals(true, rows.nextBatch(batch)); + Assert.assertEquals(true, rows.nextBatch(batch)); assertEquals(1024, batch.size); for(int r=0; r < 1024; ++r) { if (r < 200) { @@ -2634,7 +2625,7 @@ public void testUnions() throws Exception { assertEquals("row " + r, -r, longs.vector[r]); } } - assertEquals(false, rows.nextBatch(batch)); + Assert.assertEquals(false, rows.nextBatch(batch)); } /** @@ -2707,7 +2698,7 @@ public void testLists() throws Exception { assertEquals("row " + r, "null", actual.toString()); } } - assertEquals(false, rows.nextBatch(batch)); + Assert.assertEquals(false, rows.nextBatch(batch)); } /** diff --git a/ql/src/test/org/apache/hadoop/hive/ql/io/orc/TestOrcWideTable.java b/orc/src/test/org/apache/orc/impl/TestOrcWideTable.java similarity index 97% rename from ql/src/test/org/apache/hadoop/hive/ql/io/orc/TestOrcWideTable.java rename to orc/src/test/org/apache/orc/impl/TestOrcWideTable.java index da2c681d0421..289a86e31b8d 100644 --- a/ql/src/test/org/apache/hadoop/hive/ql/io/orc/TestOrcWideTable.java +++ b/orc/src/test/org/apache/orc/impl/TestOrcWideTable.java @@ -16,7 +16,7 @@ * limitations under the License. */ -package org.apache.hadoop.hive.ql.io.orc; +package org.apache.orc.impl; import static org.junit.Assert.assertEquals; @@ -61,4 +61,4 @@ public void testBufferSizeFor25000Col() throws IOException { assertEquals(4 * 1024, WriterImpl.getEstimatedBufferSize(512 * 1024 * 1024, 25000, 256*1024)); } -} \ No newline at end of file +} diff --git a/ql/src/test/org/apache/hadoop/hive/ql/io/orc/TestRLEv2.java b/orc/src/test/org/apache/orc/impl/TestRLEv2.java similarity index 81% rename from ql/src/test/org/apache/hadoop/hive/ql/io/orc/TestRLEv2.java rename to orc/src/test/org/apache/orc/impl/TestRLEv2.java index 1a3559ea597a..e139619c92bc 100644 --- a/ql/src/test/org/apache/hadoop/hive/ql/io/orc/TestRLEv2.java +++ b/orc/src/test/org/apache/orc/impl/TestRLEv2.java @@ -15,7 +15,7 @@ * See the License for the specific language governing permissions and * limitations under the License. 
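
For readers tracking the API change exercised in the TestVectorOrcFile hunks above: the rewritten tests drop the row-by-row rows.hasNext()/rows.next() calls in favor of RecordReader.nextBatch(VectorizedRowBatch) plus column vectors, and column projection moves from a bare boolean[] argument to Reader.Options.include. Below is a minimal sketch of that read loop, using only calls that appear in the hunks above; the file path, the include array, and the long-typed first column are illustrative, and conf/fs stand for the Configuration and FileSystem these tests set up.

    // Sketch only: batch-oriented read loop as used by the updated tests.
    Reader reader = OrcFile.createReader(new Path("/tmp/example.orc"),
        OrcFile.readerOptions(conf).filesystem(fs));
    RecordReader rows = reader.rows(
        new Reader.Options().include(new boolean[]{true, true}));
    VectorizedRowBatch batch = reader.getSchema().createRowBatch();
    while (rows.nextBatch(batch)) {              // replaces hasNext()/next()
      LongColumnVector col0 = (LongColumnVector) batch.cols[0];
      for (int r = 0; r < batch.size; ++r) {
        long value = col0.vector[r];             // values are read column-wise
      }
    }
    rows.close();
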
*/ -package org.apache.hadoop.hive.ql.io.orc; +package org.apache.orc.impl; import static org.junit.Assert.assertEquals; @@ -27,8 +27,13 @@ import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.Path; -import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector; -import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorFactory; +import org.apache.hadoop.hive.ql.exec.vector.LongColumnVector; +import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch; +import org.apache.orc.CompressionKind; +import org.apache.orc.OrcFile; +import org.apache.orc.TypeDescription; +import org.apache.orc.Writer; +import org.apache.orc.tools.FileDump; import org.junit.Before; import org.junit.Rule; import org.junit.Test; @@ -53,22 +58,26 @@ public void openFileSystem () throws Exception { fs.delete(testFilePath, false); } + void appendInt(VectorizedRowBatch batch, int i) { + ((LongColumnVector) batch.cols[0]).vector[batch.size++] = i; + } + @Test public void testFixedDeltaZero() throws Exception { - ObjectInspector inspector = ObjectInspectorFactory.getReflectionObjectInspector( - Integer.class, ObjectInspectorFactory.ObjectInspectorOptions.JAVA); + TypeDescription schema = TypeDescription.createInt(); Writer w = OrcFile.createWriter(testFilePath, OrcFile.writerOptions(conf) .compress(CompressionKind.NONE) - .inspector(inspector) + .setSchema(schema) .rowIndexStride(0) .encodingStrategy(OrcFile.EncodingStrategy.COMPRESSION) .version(OrcFile.Version.V_0_12) ); - + VectorizedRowBatch batch = schema.createRowBatch(5120); for (int i = 0; i < 5120; ++i) { - w.addRow(123); + appendInt(batch, 123); } + w.addRowBatch(batch); w.close(); PrintStream origOut = System.out; @@ -85,20 +94,20 @@ public void testFixedDeltaZero() throws Exception { @Test public void testFixedDeltaOne() throws Exception { - ObjectInspector inspector = ObjectInspectorFactory.getReflectionObjectInspector( - Integer.class, ObjectInspectorFactory.ObjectInspectorOptions.JAVA); + TypeDescription schema = TypeDescription.createInt(); Writer w = OrcFile.createWriter(testFilePath, OrcFile.writerOptions(conf) .compress(CompressionKind.NONE) - .inspector(inspector) + .setSchema(schema) .rowIndexStride(0) .encodingStrategy(OrcFile.EncodingStrategy.COMPRESSION) .version(OrcFile.Version.V_0_12) ); - + VectorizedRowBatch batch = schema.createRowBatch(5120); for (int i = 0; i < 5120; ++i) { - w.addRow(i % 512); + appendInt(batch, i % 512); } + w.addRowBatch(batch); w.close(); PrintStream origOut = System.out; @@ -115,20 +124,20 @@ public void testFixedDeltaOne() throws Exception { @Test public void testFixedDeltaOneDescending() throws Exception { - ObjectInspector inspector = ObjectInspectorFactory.getReflectionObjectInspector( - Integer.class, ObjectInspectorFactory.ObjectInspectorOptions.JAVA); + TypeDescription schema = TypeDescription.createInt(); Writer w = OrcFile.createWriter(testFilePath, OrcFile.writerOptions(conf) .compress(CompressionKind.NONE) - .inspector(inspector) + .setSchema(schema) .rowIndexStride(0) .encodingStrategy(OrcFile.EncodingStrategy.COMPRESSION) .version(OrcFile.Version.V_0_12) ); - + VectorizedRowBatch batch = schema.createRowBatch(5120); for (int i = 0; i < 5120; ++i) { - w.addRow(512 - (i % 512)); + appendInt(batch, 512 - (i % 512)); } + w.addRowBatch(batch); w.close(); PrintStream origOut = System.out; @@ -145,20 +154,20 @@ public void testFixedDeltaOneDescending() throws Exception { @Test public void testFixedDeltaLarge() throws Exception { - 
ObjectInspector inspector = ObjectInspectorFactory.getReflectionObjectInspector( - Integer.class, ObjectInspectorFactory.ObjectInspectorOptions.JAVA); + TypeDescription schema = TypeDescription.createInt(); Writer w = OrcFile.createWriter(testFilePath, OrcFile.writerOptions(conf) .compress(CompressionKind.NONE) - .inspector(inspector) + .setSchema(schema) .rowIndexStride(0) .encodingStrategy(OrcFile.EncodingStrategy.COMPRESSION) .version(OrcFile.Version.V_0_12) ); - + VectorizedRowBatch batch = schema.createRowBatch(5120); for (int i = 0; i < 5120; ++i) { - w.addRow(i % 512 + ((i % 512 ) * 100)); + appendInt(batch, i % 512 + ((i % 512) * 100)); } + w.addRowBatch(batch); w.close(); PrintStream origOut = System.out; @@ -175,20 +184,20 @@ public void testFixedDeltaLarge() throws Exception { @Test public void testFixedDeltaLargeDescending() throws Exception { - ObjectInspector inspector = ObjectInspectorFactory.getReflectionObjectInspector( - Integer.class, ObjectInspectorFactory.ObjectInspectorOptions.JAVA); + TypeDescription schema = TypeDescription.createInt(); Writer w = OrcFile.createWriter(testFilePath, OrcFile.writerOptions(conf) .compress(CompressionKind.NONE) - .inspector(inspector) + .setSchema(schema) .rowIndexStride(0) .encodingStrategy(OrcFile.EncodingStrategy.COMPRESSION) .version(OrcFile.Version.V_0_12) ); - + VectorizedRowBatch batch = schema.createRowBatch(5120); for (int i = 0; i < 5120; ++i) { - w.addRow((512 - i % 512) + ((i % 512 ) * 100)); + appendInt(batch, (512 - i % 512) + ((i % 512) * 100)); } + w.addRowBatch(batch); w.close(); PrintStream origOut = System.out; @@ -205,20 +214,20 @@ public void testFixedDeltaLargeDescending() throws Exception { @Test public void testShortRepeat() throws Exception { - ObjectInspector inspector = ObjectInspectorFactory.getReflectionObjectInspector( - Integer.class, ObjectInspectorFactory.ObjectInspectorOptions.JAVA); + TypeDescription schema = TypeDescription.createInt(); Writer w = OrcFile.createWriter(testFilePath, OrcFile.writerOptions(conf) .compress(CompressionKind.NONE) - .inspector(inspector) + .setSchema(schema) .rowIndexStride(0) .encodingStrategy(OrcFile.EncodingStrategy.COMPRESSION) .version(OrcFile.Version.V_0_12) ); - + VectorizedRowBatch batch = schema.createRowBatch(5120); for (int i = 0; i < 5; ++i) { - w.addRow(10); + appendInt(batch, 10); } + w.addRowBatch(batch); w.close(); PrintStream origOut = System.out; @@ -234,21 +243,21 @@ public void testShortRepeat() throws Exception { @Test public void testDeltaUnknownSign() throws Exception { - ObjectInspector inspector = ObjectInspectorFactory.getReflectionObjectInspector( - Integer.class, ObjectInspectorFactory.ObjectInspectorOptions.JAVA); + TypeDescription schema = TypeDescription.createInt(); Writer w = OrcFile.createWriter(testFilePath, OrcFile.writerOptions(conf) .compress(CompressionKind.NONE) - .inspector(inspector) + .setSchema(schema) .rowIndexStride(0) .encodingStrategy(OrcFile.EncodingStrategy.COMPRESSION) .version(OrcFile.Version.V_0_12) ); - - w.addRow(0); + VectorizedRowBatch batch = schema.createRowBatch(5120); + appendInt(batch, 0); for (int i = 0; i < 511; ++i) { - w.addRow(i); + appendInt(batch, i); } + w.addRowBatch(batch); w.close(); PrintStream origOut = System.out; @@ -266,22 +275,23 @@ public void testDeltaUnknownSign() throws Exception { @Test public void testPatchedBase() throws Exception { - ObjectInspector inspector = ObjectInspectorFactory.getReflectionObjectInspector( - Integer.class, ObjectInspectorFactory.ObjectInspectorOptions.JAVA); + 
TypeDescription schema = TypeDescription.createInt(); Writer w = OrcFile.createWriter(testFilePath, OrcFile.writerOptions(conf) .compress(CompressionKind.NONE) - .inspector(inspector) + .setSchema(schema) .rowIndexStride(0) .encodingStrategy(OrcFile.EncodingStrategy.COMPRESSION) .version(OrcFile.Version.V_0_12) ); Random rand = new Random(123); - w.addRow(10000000); + VectorizedRowBatch batch = schema.createRowBatch(5120); + appendInt(batch, 10000000); for (int i = 0; i < 511; ++i) { - w.addRow(rand.nextInt(i+1)); + appendInt(batch, rand.nextInt(i+1)); } + w.addRowBatch(batch); w.close(); PrintStream origOut = System.out; diff --git a/ql/src/test/org/apache/hadoop/hive/ql/io/orc/TestReaderImpl.java b/orc/src/test/org/apache/orc/impl/TestReaderImpl.java similarity index 96% rename from ql/src/test/org/apache/hadoop/hive/ql/io/orc/TestReaderImpl.java rename to orc/src/test/org/apache/orc/impl/TestReaderImpl.java index e0199d68a6f1..23d0dab2c43b 100644 --- a/ql/src/test/org/apache/hadoop/hive/ql/io/orc/TestReaderImpl.java +++ b/orc/src/test/org/apache/orc/impl/TestReaderImpl.java @@ -13,7 +13,7 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package org.apache.hadoop.hive.ql.io.orc; +package org.apache.orc.impl; import java.io.ByteArrayInputStream; import java.io.EOFException; @@ -24,8 +24,9 @@ import org.apache.hadoop.fs.Path; import org.apache.hadoop.fs.PositionedReadable; import org.apache.hadoop.fs.Seekable; -import org.apache.hadoop.hive.ql.io.FileFormatException; +import org.apache.orc.FileFormatException; import org.apache.hadoop.io.Text; +import org.apache.orc.OrcFile; import org.junit.Test; import org.junit.Before; import org.junit.Rule; @@ -68,7 +69,7 @@ public void testEnsureOrcFooter011ORCFile() throws IOException { @Test public void testEnsureOrcFooterCorrectORCFooter() throws IOException { - prepareTestCase(composeContent("",OrcFile.MAGIC)); + prepareTestCase(composeContent("", OrcFile.MAGIC)); ReaderImpl.ensureOrcFooter(in, path, psLen, buffer); } diff --git a/ql/src/test/org/apache/hadoop/hive/ql/io/orc/TestRecordReaderImpl.java b/orc/src/test/org/apache/orc/impl/TestRecordReaderImpl.java similarity index 90% rename from ql/src/test/org/apache/hadoop/hive/ql/io/orc/TestRecordReaderImpl.java rename to orc/src/test/org/apache/orc/impl/TestRecordReaderImpl.java index 8731be07d1b8..cdd62ac10d9b 100644 --- a/ql/src/test/org/apache/hadoop/hive/ql/io/orc/TestRecordReaderImpl.java +++ b/orc/src/test/org/apache/orc/impl/TestRecordReaderImpl.java @@ -16,7 +16,7 @@ * limitations under the License. 
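
The TestRLEv2 changes above replace the reflection ObjectInspector plus per-row addRow(...) writer path with a declared TypeDescription, a VectorizedRowBatch filled in memory, and a single Writer.addRowBatch call. Below is a minimal sketch of that write pattern under the same writer options the tests use; the output path and the constant value are illustrative, and conf is the Hadoop Configuration the tests create.

    // Sketch only: vectorized write path matching the rewritten RLE tests.
    TypeDescription schema = TypeDescription.createInt();
    Writer w = OrcFile.createWriter(new Path("/tmp/rle-example.orc"),
        OrcFile.writerOptions(conf)
            .compress(CompressionKind.NONE)
            .setSchema(schema)                   // replaces .inspector(...)
            .rowIndexStride(0)
            .version(OrcFile.Version.V_0_12));
    VectorizedRowBatch batch = schema.createRowBatch(5120);
    LongColumnVector col = (LongColumnVector) batch.cols[0];
    for (int i = 0; i < 5120; ++i) {
      col.vector[batch.size++] = 123;            // same effect as appendInt(batch, 123)
    }
    w.addRowBatch(batch);                        // one call replaces the addRow loop
    w.close();
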
*/ -package org.apache.hadoop.hive.ql.io.orc; +package org.apache.orc.impl; import static junit.framework.Assert.assertEquals; import static org.hamcrest.core.Is.is; @@ -35,6 +35,7 @@ import java.util.ArrayList; import java.util.List; +import junit.framework.Assert; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.FSDataInputStream; import org.apache.hadoop.fs.FileStatus; @@ -44,20 +45,21 @@ import org.apache.hadoop.fs.Seekable; import org.apache.hadoop.hive.common.io.DiskRangeList; import org.apache.hadoop.hive.common.type.HiveDecimal; -import org.apache.hive.common.util.HiveTestUtils; +import org.apache.hadoop.hive.ql.io.sarg.SearchArgumentImpl; import org.apache.orc.BloomFilterIO; -import org.apache.hadoop.hive.ql.io.orc.RecordReaderImpl.Location; +import org.apache.orc.DataReader; +import org.apache.orc.RecordReader; +import org.apache.orc.TypeDescription; +import org.apache.orc.Writer; +import org.apache.orc.impl.RecordReaderImpl.Location; import org.apache.hadoop.hive.ql.io.sarg.PredicateLeaf; import org.apache.hadoop.hive.ql.io.sarg.SearchArgument.TruthValue; -import org.apache.hadoop.hive.ql.io.sarg.TestSearchArgumentImpl; import org.apache.hadoop.hive.serde2.io.DateWritable; import org.apache.hadoop.hive.serde2.io.HiveDecimalWritable; import org.apache.hadoop.io.DataOutputBuffer; import org.apache.orc.ColumnStatistics; -import org.apache.orc.DataReader; -import org.apache.orc.StripeInformation; -import org.apache.orc.TypeDescription; -import org.apache.orc.impl.ColumnStatisticsImpl; +import org.apache.orc.OrcFile; +import org.apache.orc.Reader; import org.apache.orc.OrcProto; import org.junit.Test; @@ -65,6 +67,17 @@ import org.mockito.Mockito; public class TestRecordReaderImpl { + /** + * Create a predicate leaf. This is used by another test. 
+ */ + public static PredicateLeaf createPredicateLeaf(PredicateLeaf.Operator operator, + PredicateLeaf.Type type, + String columnName, + Object literal, + List literalList) { + return new SearchArgumentImpl.PredicateLeafImpl(operator, type, columnName, + literal, literalList); + } // can add .verboseLogging() to cause Mockito to log invocations private final MockSettings settings = Mockito.withSettings().verboseLogging(); @@ -359,21 +372,21 @@ public void testGetMax() throws Exception { @Test public void testPredEvalWithBooleanStats() throws Exception { - PredicateLeaf pred = TestSearchArgumentImpl.createPredicateLeaf( + PredicateLeaf pred = createPredicateLeaf( PredicateLeaf.Operator.NULL_SAFE_EQUALS, PredicateLeaf.Type.BOOLEAN, "x", true, null); assertEquals(TruthValue.YES_NO, RecordReaderImpl.evaluatePredicateProto(createBooleanStats(10, 10), pred, null)); assertEquals(TruthValue.NO, RecordReaderImpl.evaluatePredicateProto(createBooleanStats(10, 0), pred, null)); - pred = TestSearchArgumentImpl.createPredicateLeaf( + pred = createPredicateLeaf( PredicateLeaf.Operator.NULL_SAFE_EQUALS, PredicateLeaf.Type.BOOLEAN, "x", true, null); assertEquals(TruthValue.YES_NO, RecordReaderImpl.evaluatePredicateProto(createBooleanStats(10, 10), pred, null)); assertEquals(TruthValue.NO, RecordReaderImpl.evaluatePredicateProto(createBooleanStats(10, 0), pred, null)); - pred = TestSearchArgumentImpl.createPredicateLeaf( + pred = createPredicateLeaf( PredicateLeaf.Operator.NULL_SAFE_EQUALS, PredicateLeaf.Type.BOOLEAN, "x", false, null); assertEquals(TruthValue.NO, RecordReaderImpl.evaluatePredicateProto(createBooleanStats(10, 10), pred, null)); @@ -383,34 +396,34 @@ public void testPredEvalWithBooleanStats() throws Exception { @Test public void testPredEvalWithIntStats() throws Exception { - PredicateLeaf pred = TestSearchArgumentImpl.createPredicateLeaf( - PredicateLeaf.Operator.NULL_SAFE_EQUALS, PredicateLeaf.Type.LONG, "x", 15L, null); + PredicateLeaf pred = createPredicateLeaf( + PredicateLeaf.Operator.NULL_SAFE_EQUALS, PredicateLeaf.Type.LONG, "x", 15L, null); assertEquals(TruthValue.YES_NO, RecordReaderImpl.evaluatePredicateProto(createIntStats(10, 100), pred, null)); - pred = TestSearchArgumentImpl.createPredicateLeaf(PredicateLeaf.Operator.NULL_SAFE_EQUALS, + pred = createPredicateLeaf(PredicateLeaf.Operator.NULL_SAFE_EQUALS, PredicateLeaf.Type.FLOAT, "x", 15.0, null); assertEquals(TruthValue.YES_NO, RecordReaderImpl.evaluatePredicateProto(createIntStats(10, 100), pred, null)); // Stats gets converted to column type. 
"15" is outside of "10" and "100" - pred = TestSearchArgumentImpl.createPredicateLeaf(PredicateLeaf.Operator.NULL_SAFE_EQUALS, + pred = createPredicateLeaf(PredicateLeaf.Operator.NULL_SAFE_EQUALS, PredicateLeaf.Type.STRING, "x", "15", null); assertEquals(TruthValue.NO, RecordReaderImpl.evaluatePredicateProto(createIntStats(10, 100), pred, null)); // Integer stats will not be converted date because of days/seconds/millis ambiguity - pred = TestSearchArgumentImpl.createPredicateLeaf(PredicateLeaf.Operator.NULL_SAFE_EQUALS, + pred = createPredicateLeaf(PredicateLeaf.Operator.NULL_SAFE_EQUALS, PredicateLeaf.Type.DATE, "x", new DateWritable(15).get(), null); assertEquals(TruthValue.YES_NO, RecordReaderImpl.evaluatePredicateProto(createIntStats(10, 100), pred, null)); - pred = TestSearchArgumentImpl.createPredicateLeaf(PredicateLeaf.Operator.NULL_SAFE_EQUALS, + pred = createPredicateLeaf(PredicateLeaf.Operator.NULL_SAFE_EQUALS, PredicateLeaf.Type.DECIMAL, "x", new HiveDecimalWritable("15"), null); assertEquals(TruthValue.YES_NO, RecordReaderImpl.evaluatePredicateProto(createIntStats(10, 100), pred, null)); - pred = TestSearchArgumentImpl.createPredicateLeaf(PredicateLeaf.Operator.NULL_SAFE_EQUALS, + pred = createPredicateLeaf(PredicateLeaf.Operator.NULL_SAFE_EQUALS, PredicateLeaf.Type.TIMESTAMP, "x", new Timestamp(15), null); assertEquals(TruthValue.YES_NO, RecordReaderImpl.evaluatePredicateProto(createIntStats(10, 100), pred, null)); @@ -418,39 +431,39 @@ public void testPredEvalWithIntStats() throws Exception { @Test public void testPredEvalWithDoubleStats() throws Exception { - PredicateLeaf pred = TestSearchArgumentImpl.createPredicateLeaf( + PredicateLeaf pred = createPredicateLeaf( PredicateLeaf.Operator.NULL_SAFE_EQUALS, PredicateLeaf.Type.LONG, "x", 15L, null); assertEquals(TruthValue.YES_NO, RecordReaderImpl.evaluatePredicateProto(createDoubleStats(10.0, 100.0), pred, null)); - pred = TestSearchArgumentImpl.createPredicateLeaf(PredicateLeaf.Operator.NULL_SAFE_EQUALS, + pred = createPredicateLeaf(PredicateLeaf.Operator.NULL_SAFE_EQUALS, PredicateLeaf.Type.FLOAT, "x", 15.0, null); assertEquals(TruthValue.YES_NO, RecordReaderImpl.evaluatePredicateProto(createDoubleStats(10.0, 100.0), pred, null)); // Stats gets converted to column type. 
"15.0" is outside of "10.0" and "100.0" - pred = TestSearchArgumentImpl.createPredicateLeaf(PredicateLeaf.Operator.NULL_SAFE_EQUALS, + pred = createPredicateLeaf(PredicateLeaf.Operator.NULL_SAFE_EQUALS, PredicateLeaf.Type.STRING, "x", "15", null); assertEquals(TruthValue.NO, RecordReaderImpl.evaluatePredicateProto(createDoubleStats(10.0, 100.0), pred, null)); // Double is not converted to date type because of days/seconds/millis ambiguity - pred = TestSearchArgumentImpl.createPredicateLeaf(PredicateLeaf.Operator.NULL_SAFE_EQUALS, + pred = createPredicateLeaf(PredicateLeaf.Operator.NULL_SAFE_EQUALS, PredicateLeaf.Type.DATE, "x", new DateWritable(15).get(), null); assertEquals(TruthValue.YES_NO, RecordReaderImpl.evaluatePredicateProto(createDoubleStats(10.0, 100.0), pred, null)); - pred = TestSearchArgumentImpl.createPredicateLeaf(PredicateLeaf.Operator.NULL_SAFE_EQUALS, + pred = createPredicateLeaf(PredicateLeaf.Operator.NULL_SAFE_EQUALS, PredicateLeaf.Type.DECIMAL, "x", new HiveDecimalWritable("15"), null); assertEquals(TruthValue.YES_NO, RecordReaderImpl.evaluatePredicateProto(createDoubleStats(10.0, 100.0), pred, null)); - pred = TestSearchArgumentImpl.createPredicateLeaf(PredicateLeaf.Operator.NULL_SAFE_EQUALS, + pred = createPredicateLeaf(PredicateLeaf.Operator.NULL_SAFE_EQUALS, PredicateLeaf.Type.TIMESTAMP, "x", new Timestamp(15*1000L), null); assertEquals(TruthValue.YES_NO, RecordReaderImpl.evaluatePredicateProto(createDoubleStats(10.0, 100.0), pred, null)); - pred = TestSearchArgumentImpl.createPredicateLeaf(PredicateLeaf.Operator.NULL_SAFE_EQUALS, + pred = createPredicateLeaf(PredicateLeaf.Operator.NULL_SAFE_EQUALS, PredicateLeaf.Type.TIMESTAMP, "x", new Timestamp(150*1000L), null); assertEquals(TruthValue.NO, RecordReaderImpl.evaluatePredicateProto(createDoubleStats(10.0, 100.0), pred, null)); @@ -458,33 +471,33 @@ public void testPredEvalWithDoubleStats() throws Exception { @Test public void testPredEvalWithStringStats() throws Exception { - PredicateLeaf pred = TestSearchArgumentImpl.createPredicateLeaf( + PredicateLeaf pred = createPredicateLeaf( PredicateLeaf.Operator.NULL_SAFE_EQUALS, PredicateLeaf.Type.LONG, "x", 100L, null); assertEquals(TruthValue.YES_NO, RecordReaderImpl.evaluatePredicateProto(createStringStats("10", "1000"), pred, null)); - pred = TestSearchArgumentImpl.createPredicateLeaf(PredicateLeaf.Operator.NULL_SAFE_EQUALS, + pred = createPredicateLeaf(PredicateLeaf.Operator.NULL_SAFE_EQUALS, PredicateLeaf.Type.FLOAT, "x", 100.0, null); assertEquals(TruthValue.YES_NO, RecordReaderImpl.evaluatePredicateProto(createStringStats("10", "1000"), pred, null)); - pred = TestSearchArgumentImpl.createPredicateLeaf(PredicateLeaf.Operator.NULL_SAFE_EQUALS, + pred = createPredicateLeaf(PredicateLeaf.Operator.NULL_SAFE_EQUALS, PredicateLeaf.Type.STRING, "x", "100", null); assertEquals(TruthValue.YES_NO, RecordReaderImpl.evaluatePredicateProto(createStringStats("10", "1000"), pred, null)); // IllegalArgumentException is thrown when converting String to Date, hence YES_NO - pred = TestSearchArgumentImpl.createPredicateLeaf(PredicateLeaf.Operator.NULL_SAFE_EQUALS, + pred = createPredicateLeaf(PredicateLeaf.Operator.NULL_SAFE_EQUALS, PredicateLeaf.Type.DATE, "x", new DateWritable(100).get(), null); assertEquals(TruthValue.YES_NO, RecordReaderImpl.evaluatePredicateProto(createDateStats(10, 1000), pred, null)); - pred = TestSearchArgumentImpl.createPredicateLeaf(PredicateLeaf.Operator.NULL_SAFE_EQUALS, + pred = createPredicateLeaf(PredicateLeaf.Operator.NULL_SAFE_EQUALS, 
PredicateLeaf.Type.DECIMAL, "x", new HiveDecimalWritable("100"), null); assertEquals(TruthValue.YES_NO, RecordReaderImpl.evaluatePredicateProto(createStringStats("10", "1000"), pred, null)); - pred = TestSearchArgumentImpl.createPredicateLeaf(PredicateLeaf.Operator.NULL_SAFE_EQUALS, + pred = createPredicateLeaf(PredicateLeaf.Operator.NULL_SAFE_EQUALS, PredicateLeaf.Type.TIMESTAMP, "x", new Timestamp(100), null); assertEquals(TruthValue.YES_NO, RecordReaderImpl.evaluatePredicateProto(createStringStats("10", "1000"), pred, null)); @@ -492,70 +505,70 @@ public void testPredEvalWithStringStats() throws Exception { @Test public void testPredEvalWithDateStats() throws Exception { - PredicateLeaf pred = TestSearchArgumentImpl.createPredicateLeaf( + PredicateLeaf pred = createPredicateLeaf( PredicateLeaf.Operator.NULL_SAFE_EQUALS, PredicateLeaf.Type.LONG, "x", 15L, null); // Date to Integer conversion is not possible. assertEquals(TruthValue.YES_NO, RecordReaderImpl.evaluatePredicateProto(createDateStats(10, 100), pred, null)); // Date to Float conversion is also not possible. - pred = TestSearchArgumentImpl.createPredicateLeaf(PredicateLeaf.Operator.NULL_SAFE_EQUALS, + pred = createPredicateLeaf(PredicateLeaf.Operator.NULL_SAFE_EQUALS, PredicateLeaf.Type.FLOAT, "x", 15.0, null); assertEquals(TruthValue.YES_NO, RecordReaderImpl.evaluatePredicateProto(createDateStats(10, 100), pred, null)); - pred = TestSearchArgumentImpl.createPredicateLeaf(PredicateLeaf.Operator.NULL_SAFE_EQUALS, + pred = createPredicateLeaf(PredicateLeaf.Operator.NULL_SAFE_EQUALS, PredicateLeaf.Type.STRING, "x", "15", null); assertEquals(TruthValue.NO, RecordReaderImpl.evaluatePredicateProto(createDateStats(10, 100), pred, null)); - pred = TestSearchArgumentImpl.createPredicateLeaf(PredicateLeaf.Operator.NULL_SAFE_EQUALS, + pred = createPredicateLeaf(PredicateLeaf.Operator.NULL_SAFE_EQUALS, PredicateLeaf.Type.STRING, "x", "1970-01-11", null); assertEquals(TruthValue.YES_NO, RecordReaderImpl.evaluatePredicateProto(createDateStats(10, 100), pred, null)); - pred = TestSearchArgumentImpl.createPredicateLeaf(PredicateLeaf.Operator.NULL_SAFE_EQUALS, + pred = createPredicateLeaf(PredicateLeaf.Operator.NULL_SAFE_EQUALS, PredicateLeaf.Type.STRING, "x", "15.1", null); assertEquals(TruthValue.NO, RecordReaderImpl.evaluatePredicateProto(createDateStats(10, 100), pred, null)); - pred = TestSearchArgumentImpl.createPredicateLeaf(PredicateLeaf.Operator.NULL_SAFE_EQUALS, + pred = createPredicateLeaf(PredicateLeaf.Operator.NULL_SAFE_EQUALS, PredicateLeaf.Type.STRING, "x", "__a15__1", null); assertEquals(TruthValue.NO, RecordReaderImpl.evaluatePredicateProto(createDateStats(10, 100), pred, null)); - pred = TestSearchArgumentImpl.createPredicateLeaf(PredicateLeaf.Operator.NULL_SAFE_EQUALS, + pred = createPredicateLeaf(PredicateLeaf.Operator.NULL_SAFE_EQUALS, PredicateLeaf.Type.STRING, "x", "2000-01-16", null); assertEquals(TruthValue.NO, RecordReaderImpl.evaluatePredicateProto(createDateStats(10, 100), pred, null)); - pred = TestSearchArgumentImpl.createPredicateLeaf(PredicateLeaf.Operator.NULL_SAFE_EQUALS, + pred = createPredicateLeaf(PredicateLeaf.Operator.NULL_SAFE_EQUALS, PredicateLeaf.Type.STRING, "x", "1970-01-16", null); assertEquals(TruthValue.YES_NO, RecordReaderImpl.evaluatePredicateProto(createDateStats(10, 100), pred, null)); - pred = TestSearchArgumentImpl.createPredicateLeaf(PredicateLeaf.Operator.NULL_SAFE_EQUALS, + pred = createPredicateLeaf(PredicateLeaf.Operator.NULL_SAFE_EQUALS, PredicateLeaf.Type.DATE, "x", new 
DateWritable(15).get(), null); assertEquals(TruthValue.YES_NO, RecordReaderImpl.evaluatePredicateProto(createDateStats(10, 100), pred, null)); - pred = TestSearchArgumentImpl.createPredicateLeaf(PredicateLeaf.Operator.NULL_SAFE_EQUALS, + pred = createPredicateLeaf(PredicateLeaf.Operator.NULL_SAFE_EQUALS, PredicateLeaf.Type.DATE, "x", new DateWritable(150).get(), null); assertEquals(TruthValue.NO, RecordReaderImpl.evaluatePredicateProto(createDateStats(10, 100), pred, null)); // Date to Decimal conversion is also not possible. - pred = TestSearchArgumentImpl.createPredicateLeaf(PredicateLeaf.Operator.NULL_SAFE_EQUALS, + pred = createPredicateLeaf(PredicateLeaf.Operator.NULL_SAFE_EQUALS, PredicateLeaf.Type.DECIMAL, "x", new HiveDecimalWritable("15"), null); assertEquals(TruthValue.YES_NO, RecordReaderImpl.evaluatePredicateProto(createDateStats(10, 100), pred, null)); - pred = TestSearchArgumentImpl.createPredicateLeaf(PredicateLeaf.Operator.NULL_SAFE_EQUALS, + pred = createPredicateLeaf(PredicateLeaf.Operator.NULL_SAFE_EQUALS, PredicateLeaf.Type.TIMESTAMP, "x", new Timestamp(15), null); assertEquals(TruthValue.NO, RecordReaderImpl.evaluatePredicateProto(createDateStats(10, 100), pred, null)); - pred = TestSearchArgumentImpl.createPredicateLeaf(PredicateLeaf.Operator.NULL_SAFE_EQUALS, + pred = createPredicateLeaf(PredicateLeaf.Operator.NULL_SAFE_EQUALS, PredicateLeaf.Type.TIMESTAMP, "x", new Timestamp(15L * 24L * 60L * 60L * 1000L), null); assertEquals(TruthValue.YES_NO, RecordReaderImpl.evaluatePredicateProto(createDateStats(10, 100), pred, null)); @@ -563,39 +576,39 @@ public void testPredEvalWithDateStats() throws Exception { @Test public void testPredEvalWithDecimalStats() throws Exception { - PredicateLeaf pred = TestSearchArgumentImpl.createPredicateLeaf( + PredicateLeaf pred = createPredicateLeaf( PredicateLeaf.Operator.NULL_SAFE_EQUALS, PredicateLeaf.Type.LONG, "x", 15L, null); assertEquals(TruthValue.YES_NO, RecordReaderImpl.evaluatePredicateProto(createDecimalStats("10.0", "100.0"), pred, null)); - pred = TestSearchArgumentImpl.createPredicateLeaf(PredicateLeaf.Operator.NULL_SAFE_EQUALS, + pred = createPredicateLeaf(PredicateLeaf.Operator.NULL_SAFE_EQUALS, PredicateLeaf.Type.FLOAT, "x", 15.0, null); assertEquals(TruthValue.YES_NO, RecordReaderImpl.evaluatePredicateProto(createDecimalStats("10.0", "100.0"), pred, null)); // "15" out of range of "10.0" and "100.0" - pred = TestSearchArgumentImpl.createPredicateLeaf(PredicateLeaf.Operator.NULL_SAFE_EQUALS, + pred = createPredicateLeaf(PredicateLeaf.Operator.NULL_SAFE_EQUALS, PredicateLeaf.Type.STRING, "x", "15", null); assertEquals(TruthValue.NO, RecordReaderImpl.evaluatePredicateProto(createDecimalStats("10.0", "100.0"), pred, null)); // Decimal to Date not possible. 
- pred = TestSearchArgumentImpl.createPredicateLeaf(PredicateLeaf.Operator.NULL_SAFE_EQUALS, + pred = createPredicateLeaf(PredicateLeaf.Operator.NULL_SAFE_EQUALS, PredicateLeaf.Type.DATE, "x", new DateWritable(15).get(), null); assertEquals(TruthValue.YES_NO, RecordReaderImpl.evaluatePredicateProto(createDecimalStats("10.0", "100.0"), pred, null)); - pred = TestSearchArgumentImpl.createPredicateLeaf(PredicateLeaf.Operator.NULL_SAFE_EQUALS, + pred = createPredicateLeaf(PredicateLeaf.Operator.NULL_SAFE_EQUALS, PredicateLeaf.Type.DECIMAL, "x", new HiveDecimalWritable("15"), null); assertEquals(TruthValue.YES_NO, RecordReaderImpl.evaluatePredicateProto(createDecimalStats("10.0", "100.0"), pred, null)); - pred = TestSearchArgumentImpl.createPredicateLeaf(PredicateLeaf.Operator.NULL_SAFE_EQUALS, + pred = createPredicateLeaf(PredicateLeaf.Operator.NULL_SAFE_EQUALS, PredicateLeaf.Type.TIMESTAMP, "x", new Timestamp(15 * 1000L), null); assertEquals(TruthValue.YES_NO, RecordReaderImpl.evaluatePredicateProto(createDecimalStats("10.0", "100.0"), pred, null)); - pred = TestSearchArgumentImpl.createPredicateLeaf(PredicateLeaf.Operator.NULL_SAFE_EQUALS, + pred = createPredicateLeaf(PredicateLeaf.Operator.NULL_SAFE_EQUALS, PredicateLeaf.Type.TIMESTAMP, "x", new Timestamp(150 * 1000L), null); assertEquals(TruthValue.NO, RecordReaderImpl.evaluatePredicateProto(createDecimalStats("10.0", "100.0"), pred, null)); @@ -603,29 +616,29 @@ public void testPredEvalWithDecimalStats() throws Exception { @Test public void testPredEvalWithTimestampStats() throws Exception { - PredicateLeaf pred = TestSearchArgumentImpl.createPredicateLeaf( + PredicateLeaf pred = createPredicateLeaf( PredicateLeaf.Operator.NULL_SAFE_EQUALS, PredicateLeaf.Type.LONG, "x", 15L, null); assertEquals(TruthValue.YES_NO, RecordReaderImpl.evaluatePredicateProto(createTimestampStats(10, 100), pred, null)); - pred = TestSearchArgumentImpl.createPredicateLeaf(PredicateLeaf.Operator.NULL_SAFE_EQUALS, + pred = createPredicateLeaf(PredicateLeaf.Operator.NULL_SAFE_EQUALS, PredicateLeaf.Type.FLOAT, "x", 15.0, null); assertEquals(TruthValue.NO, RecordReaderImpl.evaluatePredicateProto(createTimestampStats(10, 100), pred, null)); assertEquals(TruthValue.YES_NO, RecordReaderImpl.evaluatePredicateProto(createTimestampStats(10000, 100000), pred, null)); - pred = TestSearchArgumentImpl.createPredicateLeaf(PredicateLeaf.Operator.NULL_SAFE_EQUALS, + pred = createPredicateLeaf(PredicateLeaf.Operator.NULL_SAFE_EQUALS, PredicateLeaf.Type.STRING, "x", "15", null); assertEquals(TruthValue.NO, RecordReaderImpl.evaluatePredicateProto(createTimestampStats(10, 100), pred, null)); - pred = TestSearchArgumentImpl.createPredicateLeaf(PredicateLeaf.Operator.NULL_SAFE_EQUALS, + pred = createPredicateLeaf(PredicateLeaf.Operator.NULL_SAFE_EQUALS, PredicateLeaf.Type.STRING, "x", new Timestamp(15).toString(), null); assertEquals(TruthValue.YES_NO, RecordReaderImpl.evaluatePredicateProto(createTimestampStats(10, 100), pred, null)); - pred = TestSearchArgumentImpl.createPredicateLeaf(PredicateLeaf.Operator.NULL_SAFE_EQUALS, + pred = createPredicateLeaf(PredicateLeaf.Operator.NULL_SAFE_EQUALS, PredicateLeaf.Type.DATE, "x", new DateWritable(15).get(), null); assertEquals(TruthValue.NO, RecordReaderImpl.evaluatePredicateProto(createTimestampStats(10, 100), pred, null)); @@ -633,14 +646,14 @@ public void testPredEvalWithTimestampStats() throws Exception { RecordReaderImpl.evaluatePredicateProto(createTimestampStats(10 * 24L * 60L * 60L * 1000L, 100 * 24L * 60L * 60L * 1000L), pred, 
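
The predicate-evaluation tests above now build leaves through the createPredicateLeaf helper added near the top of this file (a thin wrapper over SearchArgumentImpl.PredicateLeafImpl) instead of the removed TestSearchArgumentImpl dependency, and pass them to the relocated RecordReaderImpl.evaluatePredicateProto. Below is a minimal sketch of one such evaluation; createIntStats is the test-local statistics helper these cases already call, and the final null argument is the optional bloom filter.

    // Sketch only: mirrors the NULL_SAFE_EQUALS cases above.
    PredicateLeaf pred = createPredicateLeaf(
        PredicateLeaf.Operator.NULL_SAFE_EQUALS,
        PredicateLeaf.Type.LONG, "x", 15L, null);
    // Stats report min=10, max=100, so x == 15 is possible but not guaranteed.
    TruthValue result = RecordReaderImpl.evaluatePredicateProto(
        createIntStats(10, 100), pred, null);    // null means no bloom filter
    assertEquals(TruthValue.YES_NO, result);
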
null)); - pred = TestSearchArgumentImpl.createPredicateLeaf(PredicateLeaf.Operator.NULL_SAFE_EQUALS, + pred = createPredicateLeaf(PredicateLeaf.Operator.NULL_SAFE_EQUALS, PredicateLeaf.Type.DECIMAL, "x", new HiveDecimalWritable("15"), null); assertEquals(TruthValue.NO, RecordReaderImpl.evaluatePredicateProto(createTimestampStats(10, 100), pred, null)); assertEquals(TruthValue.YES_NO, RecordReaderImpl.evaluatePredicateProto(createTimestampStats(10000, 100000), pred, null)); - pred = TestSearchArgumentImpl.createPredicateLeaf(PredicateLeaf.Operator.NULL_SAFE_EQUALS, + pred = createPredicateLeaf(PredicateLeaf.Operator.NULL_SAFE_EQUALS, PredicateLeaf.Type.TIMESTAMP, "x", new Timestamp(15), null); assertEquals(TruthValue.YES_NO, RecordReaderImpl.evaluatePredicateProto(createTimestampStats(10, 100), pred, null)); @@ -650,7 +663,7 @@ public void testPredEvalWithTimestampStats() throws Exception { @Test public void testEquals() throws Exception { - PredicateLeaf pred = TestSearchArgumentImpl.createPredicateLeaf + PredicateLeaf pred = createPredicateLeaf (PredicateLeaf.Operator.EQUALS, PredicateLeaf.Type.LONG, "x", 15L, null); assertEquals(TruthValue.NO_NULL, @@ -669,7 +682,7 @@ public void testEquals() throws Exception { @Test public void testNullSafeEquals() throws Exception { - PredicateLeaf pred = TestSearchArgumentImpl.createPredicateLeaf + PredicateLeaf pred = createPredicateLeaf (PredicateLeaf.Operator.NULL_SAFE_EQUALS, PredicateLeaf.Type.LONG, "x", 15L, null); assertEquals(TruthValue.NO, @@ -688,7 +701,7 @@ public void testNullSafeEquals() throws Exception { @Test public void testLessThan() throws Exception { - PredicateLeaf lessThan = TestSearchArgumentImpl.createPredicateLeaf + PredicateLeaf lessThan = createPredicateLeaf (PredicateLeaf.Operator.LESS_THAN, PredicateLeaf.Type.LONG, "x", 15L, null); assertEquals(TruthValue.NO_NULL, @@ -705,7 +718,7 @@ public void testLessThan() throws Exception { @Test public void testLessThanEquals() throws Exception { - PredicateLeaf pred = TestSearchArgumentImpl.createPredicateLeaf + PredicateLeaf pred = createPredicateLeaf (PredicateLeaf.Operator.LESS_THAN_EQUALS, PredicateLeaf.Type.LONG, "x", 15L, null); assertEquals(TruthValue.NO_NULL, @@ -725,7 +738,7 @@ public void testIn() throws Exception { List args = new ArrayList(); args.add(10L); args.add(20L); - PredicateLeaf pred = TestSearchArgumentImpl.createPredicateLeaf + PredicateLeaf pred = createPredicateLeaf (PredicateLeaf.Operator.IN, PredicateLeaf.Type.LONG, "x", null, args); assertEquals(TruthValue.YES_NULL, @@ -743,7 +756,7 @@ public void testBetween() throws Exception { List args = new ArrayList(); args.add(10L); args.add(20L); - PredicateLeaf pred = TestSearchArgumentImpl.createPredicateLeaf + PredicateLeaf pred = createPredicateLeaf (PredicateLeaf.Operator.BETWEEN, PredicateLeaf.Type.LONG, "x", null, args); assertEquals(TruthValue.NO_NULL, @@ -764,7 +777,7 @@ public void testBetween() throws Exception { @Test public void testIsNull() throws Exception { - PredicateLeaf pred = TestSearchArgumentImpl.createPredicateLeaf + PredicateLeaf pred = createPredicateLeaf (PredicateLeaf.Operator.IS_NULL, PredicateLeaf.Type.LONG, "x", null, null); assertEquals(TruthValue.YES_NO, @@ -774,7 +787,7 @@ public void testIsNull() throws Exception { @Test public void testEqualsWithNullInStats() throws Exception { - PredicateLeaf pred = TestSearchArgumentImpl.createPredicateLeaf + PredicateLeaf pred = createPredicateLeaf (PredicateLeaf.Operator.EQUALS, PredicateLeaf.Type.STRING, "x", "c", null); 
assertEquals(TruthValue.NO_NULL, @@ -793,7 +806,7 @@ public void testEqualsWithNullInStats() throws Exception { @Test public void testNullSafeEqualsWithNullInStats() throws Exception { - PredicateLeaf pred = TestSearchArgumentImpl.createPredicateLeaf + PredicateLeaf pred = createPredicateLeaf (PredicateLeaf.Operator.NULL_SAFE_EQUALS, PredicateLeaf.Type.STRING, "x", "c", null); assertEquals(TruthValue.NO, @@ -812,7 +825,7 @@ public void testNullSafeEqualsWithNullInStats() throws Exception { @Test public void testLessThanWithNullInStats() throws Exception { - PredicateLeaf pred = TestSearchArgumentImpl.createPredicateLeaf + PredicateLeaf pred = createPredicateLeaf (PredicateLeaf.Operator.LESS_THAN, PredicateLeaf.Type.STRING, "x", "c", null); assertEquals(TruthValue.NO_NULL, @@ -831,7 +844,7 @@ public void testLessThanWithNullInStats() throws Exception { @Test public void testLessThanEqualsWithNullInStats() throws Exception { - PredicateLeaf pred = TestSearchArgumentImpl.createPredicateLeaf + PredicateLeaf pred = createPredicateLeaf (PredicateLeaf.Operator.LESS_THAN_EQUALS, PredicateLeaf.Type.STRING, "x", "c", null); assertEquals(TruthValue.NO_NULL, @@ -853,7 +866,7 @@ public void testInWithNullInStats() throws Exception { List args = new ArrayList(); args.add("c"); args.add("f"); - PredicateLeaf pred = TestSearchArgumentImpl.createPredicateLeaf + PredicateLeaf pred = createPredicateLeaf (PredicateLeaf.Operator.IN, PredicateLeaf.Type.STRING, "x", null, args); assertEquals(TruthValue.NO_NULL, // before & after @@ -875,7 +888,7 @@ public void testBetweenWithNullInStats() throws Exception { List args = new ArrayList(); args.add("c"); args.add("f"); - PredicateLeaf pred = TestSearchArgumentImpl.createPredicateLeaf + PredicateLeaf pred = createPredicateLeaf (PredicateLeaf.Operator.BETWEEN, PredicateLeaf.Type.STRING, "x", null, args); assertEquals(TruthValue.YES_NULL, // before & after @@ -908,7 +921,7 @@ public void testBetweenWithNullInStats() throws Exception { @Test public void testIsNullWithNullInStats() throws Exception { - PredicateLeaf pred = TestSearchArgumentImpl.createPredicateLeaf + PredicateLeaf pred = createPredicateLeaf (PredicateLeaf.Operator.IS_NULL, PredicateLeaf.Type.STRING, "x", null, null); assertEquals(TruthValue.YES_NO, @@ -1289,7 +1302,7 @@ public void testPartialPlanString() throws Exception { @Test public void testIntNullSafeEqualsBloomFilter() throws Exception { - PredicateLeaf pred = TestSearchArgumentImpl.createPredicateLeaf( + PredicateLeaf pred = createPredicateLeaf( PredicateLeaf.Operator.NULL_SAFE_EQUALS, PredicateLeaf.Type.LONG, "x", 15L, null); BloomFilterIO bf = new BloomFilterIO(10000); for (int i = 20; i < 1000; i++) { @@ -1304,7 +1317,7 @@ public void testIntNullSafeEqualsBloomFilter() throws Exception { @Test public void testIntEqualsBloomFilter() throws Exception { - PredicateLeaf pred = TestSearchArgumentImpl.createPredicateLeaf( + PredicateLeaf pred = createPredicateLeaf( PredicateLeaf.Operator.EQUALS, PredicateLeaf.Type.LONG, "x", 15L, null); BloomFilterIO bf = new BloomFilterIO(10000); for (int i = 20; i < 1000; i++) { @@ -1322,7 +1335,7 @@ public void testIntInBloomFilter() throws Exception { List args = new ArrayList(); args.add(15L); args.add(19L); - PredicateLeaf pred = TestSearchArgumentImpl.createPredicateLeaf + PredicateLeaf pred = createPredicateLeaf (PredicateLeaf.Operator.IN, PredicateLeaf.Type.LONG, "x", null, args); BloomFilterIO bf = new BloomFilterIO(10000); @@ -1341,7 +1354,7 @@ public void testIntInBloomFilter() throws Exception { @Test 
public void testDoubleNullSafeEqualsBloomFilter() throws Exception { - PredicateLeaf pred = TestSearchArgumentImpl.createPredicateLeaf( + PredicateLeaf pred = createPredicateLeaf( PredicateLeaf.Operator.NULL_SAFE_EQUALS, PredicateLeaf.Type.FLOAT, "x", 15.0, null); BloomFilterIO bf = new BloomFilterIO(10000); for (int i = 20; i < 1000; i++) { @@ -1356,7 +1369,7 @@ public void testDoubleNullSafeEqualsBloomFilter() throws Exception { @Test public void testDoubleEqualsBloomFilter() throws Exception { - PredicateLeaf pred = TestSearchArgumentImpl.createPredicateLeaf( + PredicateLeaf pred = createPredicateLeaf( PredicateLeaf.Operator.EQUALS, PredicateLeaf.Type.FLOAT, "x", 15.0, null); BloomFilterIO bf = new BloomFilterIO(10000); for (int i = 20; i < 1000; i++) { @@ -1374,7 +1387,7 @@ public void testDoubleInBloomFilter() throws Exception { List args = new ArrayList(); args.add(15.0); args.add(19.0); - PredicateLeaf pred = TestSearchArgumentImpl.createPredicateLeaf + PredicateLeaf pred = createPredicateLeaf (PredicateLeaf.Operator.IN, PredicateLeaf.Type.FLOAT, "x", null, args); BloomFilterIO bf = new BloomFilterIO(10000); @@ -1393,7 +1406,7 @@ public void testDoubleInBloomFilter() throws Exception { @Test public void testStringNullSafeEqualsBloomFilter() throws Exception { - PredicateLeaf pred = TestSearchArgumentImpl.createPredicateLeaf( + PredicateLeaf pred = createPredicateLeaf( PredicateLeaf.Operator.NULL_SAFE_EQUALS, PredicateLeaf.Type.STRING, "x", "str_15", null); BloomFilterIO bf = new BloomFilterIO(10000); for (int i = 20; i < 1000; i++) { @@ -1408,7 +1421,7 @@ public void testStringNullSafeEqualsBloomFilter() throws Exception { @Test public void testStringEqualsBloomFilter() throws Exception { - PredicateLeaf pred = TestSearchArgumentImpl.createPredicateLeaf( + PredicateLeaf pred = createPredicateLeaf( PredicateLeaf.Operator.EQUALS, PredicateLeaf.Type.STRING, "x", "str_15", null); BloomFilterIO bf = new BloomFilterIO(10000); for (int i = 20; i < 1000; i++) { @@ -1426,7 +1439,7 @@ public void testStringInBloomFilter() throws Exception { List args = new ArrayList(); args.add("str_15"); args.add("str_19"); - PredicateLeaf pred = TestSearchArgumentImpl.createPredicateLeaf + PredicateLeaf pred = createPredicateLeaf (PredicateLeaf.Operator.IN, PredicateLeaf.Type.STRING, "x", null, args); BloomFilterIO bf = new BloomFilterIO(10000); @@ -1445,7 +1458,7 @@ public void testStringInBloomFilter() throws Exception { @Test public void testDateWritableNullSafeEqualsBloomFilter() throws Exception { - PredicateLeaf pred = TestSearchArgumentImpl.createPredicateLeaf( + PredicateLeaf pred = createPredicateLeaf( PredicateLeaf.Operator.NULL_SAFE_EQUALS, PredicateLeaf.Type.DATE, "x", new DateWritable(15).get(), null); BloomFilterIO bf = new BloomFilterIO(10000); @@ -1461,7 +1474,7 @@ public void testDateWritableNullSafeEqualsBloomFilter() throws Exception { @Test public void testDateWritableEqualsBloomFilter() throws Exception { - PredicateLeaf pred = TestSearchArgumentImpl.createPredicateLeaf( + PredicateLeaf pred = createPredicateLeaf( PredicateLeaf.Operator.EQUALS, PredicateLeaf.Type.DATE, "x", new DateWritable(15).get(), null); BloomFilterIO bf = new BloomFilterIO(10000); @@ -1480,7 +1493,7 @@ public void testDateWritableInBloomFilter() throws Exception { List args = new ArrayList(); args.add(new DateWritable(15).get()); args.add(new DateWritable(19).get()); - PredicateLeaf pred = TestSearchArgumentImpl.createPredicateLeaf + PredicateLeaf pred = createPredicateLeaf (PredicateLeaf.Operator.IN, 
PredicateLeaf.Type.DATE, "x", null, args); BloomFilterIO bf = new BloomFilterIO(10000); @@ -1499,7 +1512,7 @@ public void testDateWritableInBloomFilter() throws Exception { @Test public void testTimestampNullSafeEqualsBloomFilter() throws Exception { - PredicateLeaf pred = TestSearchArgumentImpl.createPredicateLeaf( + PredicateLeaf pred = createPredicateLeaf( PredicateLeaf.Operator.NULL_SAFE_EQUALS, PredicateLeaf.Type.TIMESTAMP, "x", new Timestamp(15), null); @@ -1516,7 +1529,7 @@ public void testTimestampNullSafeEqualsBloomFilter() throws Exception { @Test public void testTimestampEqualsBloomFilter() throws Exception { - PredicateLeaf pred = TestSearchArgumentImpl.createPredicateLeaf( + PredicateLeaf pred = createPredicateLeaf( PredicateLeaf.Operator.EQUALS, PredicateLeaf.Type.TIMESTAMP, "x", new Timestamp(15), null); BloomFilterIO bf = new BloomFilterIO(10000); for (int i = 20; i < 1000; i++) { @@ -1534,7 +1547,7 @@ public void testTimestampInBloomFilter() throws Exception { List args = new ArrayList(); args.add(new Timestamp(15)); args.add(new Timestamp(19)); - PredicateLeaf pred = TestSearchArgumentImpl.createPredicateLeaf + PredicateLeaf pred = createPredicateLeaf (PredicateLeaf.Operator.IN, PredicateLeaf.Type.TIMESTAMP, "x", null, args); BloomFilterIO bf = new BloomFilterIO(10000); @@ -1553,7 +1566,7 @@ public void testTimestampInBloomFilter() throws Exception { @Test public void testDecimalNullSafeEqualsBloomFilter() throws Exception { - PredicateLeaf pred = TestSearchArgumentImpl.createPredicateLeaf( + PredicateLeaf pred = createPredicateLeaf( PredicateLeaf.Operator.NULL_SAFE_EQUALS, PredicateLeaf.Type.DECIMAL, "x", new HiveDecimalWritable("15"), null); @@ -1570,7 +1583,7 @@ public void testDecimalNullSafeEqualsBloomFilter() throws Exception { @Test public void testDecimalEqualsBloomFilter() throws Exception { - PredicateLeaf pred = TestSearchArgumentImpl.createPredicateLeaf( + PredicateLeaf pred = createPredicateLeaf( PredicateLeaf.Operator.EQUALS, PredicateLeaf.Type.DECIMAL, "x", new HiveDecimalWritable("15"), null); @@ -1590,7 +1603,7 @@ public void testDecimalInBloomFilter() throws Exception { List args = new ArrayList(); args.add(new HiveDecimalWritable("15")); args.add(new HiveDecimalWritable("19")); - PredicateLeaf pred = TestSearchArgumentImpl.createPredicateLeaf + PredicateLeaf pred = createPredicateLeaf (PredicateLeaf.Operator.IN, PredicateLeaf.Type.DECIMAL, "x", null, args); BloomFilterIO bf = new BloomFilterIO(10000); @@ -1613,7 +1626,7 @@ public void testNullsInBloomFilter() throws Exception { args.add(new HiveDecimalWritable("15")); args.add(null); args.add(new HiveDecimalWritable("19")); - PredicateLeaf pred = TestSearchArgumentImpl.createPredicateLeaf + PredicateLeaf pred = createPredicateLeaf (PredicateLeaf.Operator.IN, PredicateLeaf.Type.DECIMAL, "x", null, args); BloomFilterIO bf = new BloomFilterIO(10000); @@ -1670,7 +1683,7 @@ private void closeMockedRecordReader(DataReader mockedDataReader) throws IOExcep writer.close(); Reader reader = OrcFile.createReader(path, OrcFile.readerOptions(conf)); - RecordReader recordReader = reader.rowsOptions(new Reader.Options() + RecordReader recordReader = reader.rows(new Reader.Options() .dataReader(mockedDataReader)); recordReader.close(); diff --git a/ql/src/test/org/apache/hadoop/hive/ql/io/orc/TestStreamName.java b/orc/src/test/org/apache/orc/impl/TestStreamName.java similarity index 95% rename from ql/src/test/org/apache/hadoop/hive/ql/io/orc/TestStreamName.java rename to 
orc/src/test/org/apache/orc/impl/TestStreamName.java index dfccd9a3275a..be58d4c3fe98 100644 --- a/ql/src/test/org/apache/hadoop/hive/ql/io/orc/TestStreamName.java +++ b/orc/src/test/org/apache/orc/impl/TestStreamName.java @@ -16,10 +16,9 @@ * limitations under the License. */ -package org.apache.hadoop.hive.ql.io.orc; +package org.apache.orc.impl; import org.apache.orc.OrcProto; -import org.apache.orc.impl.StreamName; import org.junit.Test; import static org.junit.Assert.assertEquals; diff --git a/ql/src/test/org/apache/hadoop/hive/ql/io/orc/TestFileDump.java b/orc/src/test/org/apache/orc/tools/TestFileDump.java similarity index 54% rename from ql/src/test/org/apache/hadoop/hive/ql/io/orc/TestFileDump.java rename to orc/src/test/org/apache/orc/tools/TestFileDump.java index 554033c4492e..ce3381e763f4 100644 --- a/ql/src/test/org/apache/hadoop/hive/ql/io/orc/TestFileDump.java +++ b/orc/src/test/org/apache/orc/tools/TestFileDump.java @@ -16,7 +16,7 @@ * limitations under the License. */ -package org.apache.hadoop.hive.ql.io.orc; +package org.apache.orc.tools; import static org.junit.Assert.assertEquals; import static org.junit.Assert.assertNull; @@ -38,13 +38,24 @@ import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.Path; -import org.apache.hadoop.hive.common.type.HiveChar; import org.apache.hadoop.hive.common.type.HiveDecimal; -import org.apache.hadoop.hive.common.type.HiveVarchar; -import org.apache.hadoop.hive.conf.HiveConf; -import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector; -import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorFactory; -import org.apache.hive.common.util.HiveTestUtils; +import org.apache.hadoop.hive.ql.exec.vector.BytesColumnVector; +import org.apache.hadoop.hive.ql.exec.vector.DecimalColumnVector; +import org.apache.hadoop.hive.ql.exec.vector.DoubleColumnVector; +import org.apache.hadoop.hive.ql.exec.vector.ListColumnVector; +import org.apache.hadoop.hive.ql.exec.vector.LongColumnVector; +import org.apache.hadoop.hive.ql.exec.vector.MapColumnVector; +import org.apache.hadoop.hive.ql.exec.vector.StructColumnVector; +import org.apache.hadoop.hive.ql.exec.vector.TimestampColumnVector; +import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch; +import org.apache.hadoop.hive.serde2.io.DateWritable; +import org.apache.hadoop.hive.serde2.io.HiveDecimalWritable; +import org.apache.orc.CompressionKind; +import org.apache.orc.OrcConf; +import org.apache.orc.OrcFile; +import org.apache.orc.TypeDescription; +import org.apache.orc.Writer; +import org.junit.Assert; import org.junit.Before; import org.junit.Test; @@ -64,70 +75,113 @@ public void openFileSystem () throws Exception { fs.delete(testFilePath, false); } - static class MyRecord { - int i; - long l; - String s; - MyRecord(int i, long l, String s) { - this.i = i; - this.l = l; - this.s = s; + static TypeDescription getMyRecordType() { + return TypeDescription.createStruct() + .addField("i", TypeDescription.createInt()) + .addField("l", TypeDescription.createLong()) + .addField("s", TypeDescription.createString()); + } + + static void appendMyRecord(VectorizedRowBatch batch, + int i, + long l, + String str) { + ((LongColumnVector) batch.cols[0]).vector[batch.size] = i; + ((LongColumnVector) batch.cols[1]).vector[batch.size] = l; + if (str == null) { + batch.cols[2].noNulls = false; + batch.cols[2].isNull[batch.size] = true; + } else { + ((BytesColumnVector) batch.cols[2]).setVal(batch.size, + str.getBytes()); } + batch.size 
+= 1; } - static class AllTypesRecord { - static class Struct { - int i; - String s; + static TypeDescription getAllTypesType() { + return TypeDescription.createStruct() + .addField("b", TypeDescription.createBoolean()) + .addField("bt", TypeDescription.createByte()) + .addField("s", TypeDescription.createShort()) + .addField("i", TypeDescription.createInt()) + .addField("l", TypeDescription.createLong()) + .addField("f", TypeDescription.createFloat()) + .addField("d", TypeDescription.createDouble()) + .addField("de", TypeDescription.createDecimal()) + .addField("t", TypeDescription.createTimestamp()) + .addField("dt", TypeDescription.createDate()) + .addField("str", TypeDescription.createString()) + .addField("c", TypeDescription.createChar().withMaxLength(5)) + .addField("vc", TypeDescription.createVarchar().withMaxLength(10)) + .addField("m", TypeDescription.createMap( + TypeDescription.createString(), + TypeDescription.createString())) + .addField("a", TypeDescription.createList(TypeDescription.createInt())) + .addField("st", TypeDescription.createStruct() + .addField("i", TypeDescription.createInt()) + .addField("s", TypeDescription.createString())); + } - Struct(int i, String s) { - this.i = i; - this.s = s; - } + static void appendAllTypes(VectorizedRowBatch batch, + boolean b, + byte bt, + short s, + int i, + long l, + float f, + double d, + HiveDecimalWritable de, + Timestamp t, + DateWritable dt, + String str, + String c, + String vc, + Map m, + List a, + int sti, + String sts) { + int row = batch.size++; + ((LongColumnVector) batch.cols[0]).vector[row] = b ? 1 : 0; + ((LongColumnVector) batch.cols[1]).vector[row] = bt; + ((LongColumnVector) batch.cols[2]).vector[row] = s; + ((LongColumnVector) batch.cols[3]).vector[row] = i; + ((LongColumnVector) batch.cols[4]).vector[row] = l; + ((DoubleColumnVector) batch.cols[5]).vector[row] = f; + ((DoubleColumnVector) batch.cols[6]).vector[row] = d; + ((DecimalColumnVector) batch.cols[7]).vector[row].set(de); + ((TimestampColumnVector) batch.cols[8]).set(row, t); + ((LongColumnVector) batch.cols[9]).vector[row] = dt.getDays(); + ((BytesColumnVector) batch.cols[10]).setVal(row, str.getBytes()); + ((BytesColumnVector) batch.cols[11]).setVal(row, c.getBytes()); + ((BytesColumnVector) batch.cols[12]).setVal(row, vc.getBytes()); + MapColumnVector map = (MapColumnVector) batch.cols[13]; + int offset = map.childCount; + map.offsets[row] = offset; + map.lengths[row] = m.size(); + map.childCount += map.lengths[row]; + for(Map.Entry entry: m.entrySet()) { + ((BytesColumnVector) map.keys).setVal(offset, entry.getKey().getBytes()); + ((BytesColumnVector) map.values).setVal(offset++, + entry.getValue().getBytes()); } - boolean b; - byte bt; - short s; - int i; - long l; - float f; - double d; - HiveDecimal de; - Timestamp t; - Date dt; - String str; - HiveChar c; - HiveVarchar vc; - Map m; - List a; - Struct st; - - AllTypesRecord(boolean b, byte bt, short s, int i, long l, float f, double d, HiveDecimal de, - Timestamp t, Date dt, String str, HiveChar c, HiveVarchar vc, Map m, List a, Struct st) { - this.b = b; - this.bt = bt; - this.s = s; - this.i = i; - this.l = l; - this.f = f; - this.d = d; - this.de = de; - this.t = t; - this.dt = dt; - this.str = str; - this.c = c; - this.vc = vc; - this.m = m; - this.a = a; - this.st = st; + ListColumnVector list = (ListColumnVector) batch.cols[14]; + offset = list.childCount; + list.offsets[row] = offset; + list.lengths[row] = a.size(); + list.childCount += list.lengths[row]; + for(int e=0; e < a.size(); ++e) 
{ + ((LongColumnVector) list.child).vector[offset + e] = a.get(e); } + StructColumnVector struct = (StructColumnVector) batch.cols[15]; + ((LongColumnVector) struct.fields[0]).vector[row] = sti; + ((BytesColumnVector) struct.fields[1]).setVal(row, sts.getBytes()); } - static void checkOutput(String expected, - String actual) throws Exception { + public static void checkOutput(String expected, + String actual) throws Exception { BufferedReader eStream = - new BufferedReader(new FileReader(HiveTestUtils.getFileFromClasspath(expected))); + new BufferedReader(new FileReader + (TestJsonFileDump.getFileFromClasspath(expected))); BufferedReader aStream = new BufferedReader(new FileReader(actual)); String expectedLine = eStream.readLine().trim(); @@ -135,29 +189,24 @@ static void checkOutput(String expected, String actualLine = aStream.readLine().trim(); System.out.println("actual: " + actualLine); System.out.println("expected: " + expectedLine); - assertEquals(expectedLine, actualLine); + Assert.assertEquals(expectedLine, actualLine); expectedLine = eStream.readLine(); expectedLine = expectedLine == null ? null : expectedLine.trim(); } - assertNull(eStream.readLine()); - assertNull(aStream.readLine()); + Assert.assertNull(eStream.readLine()); + Assert.assertNull(aStream.readLine()); eStream.close(); aStream.close(); } @Test public void testDump() throws Exception { - ObjectInspector inspector; - synchronized (TestOrcFile.class) { - inspector = ObjectInspectorFactory.getReflectionObjectInspector - (MyRecord.class, ObjectInspectorFactory.ObjectInspectorOptions.JAVA); - } - conf.set(HiveConf.ConfVars.HIVE_ORC_ENCODING_STRATEGY.varname, "COMPRESSION"); + TypeDescription schema = getMyRecordType(); + conf.set(OrcConf.ENCODING_STRATEGY.getAttribute(), "COMPRESSION"); Writer writer = OrcFile.createWriter(testFilePath, OrcFile.writerOptions(conf) .fileSystem(fs) - .inspector(inspector) - .batchSize(1000) + .setSchema(schema) .compress(CompressionKind.ZLIB) .stripeSize(100000) .rowIndexStride(1000)); @@ -173,9 +222,17 @@ public void testDump() throws Exception { "before", "us,", "we", "were", "all", "going", "direct", "to", "Heaven,", "we", "were", "all", "going", "direct", "the", "other", "way"}; + VectorizedRowBatch batch = schema.createRowBatch(1000); for(int i=0; i < 21000; ++i) { - writer.addRow(new MyRecord(r1.nextInt(), r1.nextLong(), - words[r1.nextInt(words.length)])); + appendMyRecord(batch, r1.nextInt(), r1.nextLong(), + words[r1.nextInt(words.length)]); + if (batch.size == batch.getMaxSize()) { + writer.addRowBatch(batch); + batch.reset(); + } + } + if (batch.size > 0) { + writer.addRowBatch(batch); } writer.close(); PrintStream origOut = System.out; @@ -194,16 +251,19 @@ public void testDump() throws Exception { @Test public void testDataDump() throws Exception { - ObjectInspector inspector; - synchronized (TestOrcFile.class) { - inspector = ObjectInspectorFactory.getReflectionObjectInspector - (AllTypesRecord.class, ObjectInspectorFactory.ObjectInspectorOptions.JAVA); - } - Writer writer = OrcFile.createWriter(fs, testFilePath, conf, inspector, - 100000, CompressionKind.NONE, 10000, 1000); + TypeDescription schema = getAllTypesType(); + Writer writer = OrcFile.createWriter(testFilePath, + OrcFile.writerOptions(conf) + .fileSystem(fs) + .setSchema(schema) + .stripeSize(100000) + .compress(CompressionKind.NONE) + .bufferSize(10000) + .rowIndexStride(1000)); + VectorizedRowBatch batch = schema.createRowBatch(1000); Map m = new HashMap(2); m.put("k1", "v1"); - writer.addRow(new AllTypesRecord( + 
appendAllTypes(batch, true, (byte) 10, (short) 100, @@ -211,18 +271,19 @@ public void testDataDump() throws Exception { 10000L, 4.0f, 20.0, - HiveDecimal.create("4.2222"), + new HiveDecimalWritable("4.2222"), new Timestamp(1416967764000L), - new Date(1416967764000L), + new DateWritable(new Date(1416967764000L)), "string", - new HiveChar("hello", 5), - new HiveVarchar("hello", 10), + "hello", + "hello", m, Arrays.asList(100, 200), - new AllTypesRecord.Struct(10, "foo"))); + 10, "foo"); m.clear(); m.put("k3", "v3"); - writer.addRow(new AllTypesRecord( + appendAllTypes( + batch, false, (byte)20, (short)200, @@ -230,15 +291,16 @@ public void testDataDump() throws Exception { 20000L, 8.0f, 40.0, - HiveDecimal.create("2.2222"), + new HiveDecimalWritable("2.2222"), new Timestamp(1416967364000L), - new Date(1411967764000L), + new DateWritable(new Date(1411967764000L)), "abcd", - new HiveChar("world", 5), - new HiveVarchar("world", 10), + "world", + "world", m, Arrays.asList(200, 300), - new AllTypesRecord.Struct(20, "bar"))); + 20, "bar"); + writer.addRowBatch(batch); writer.close(); PrintStream origOut = System.out; @@ -249,11 +311,9 @@ public void testDataDump() throws Exception { FileDump.main(new String[]{testFilePath.toString(), "-d"}); System.out.flush(); System.setOut(origOut); - String[] lines = myOut.toString().split("\n"); - // Don't be fooled by the big space in the middle, this line is quite long - assertEquals("{\"b\":true,\"bt\":10,\"s\":100,\"i\":1000,\"l\":10000,\"f\":4,\"d\":20,\"de\":\"4.2222\",\"t\":\"2014-11-25 18:09:24\",\"dt\":\"2014-11-25\",\"str\":\"string\",\"c\":\"hello \",\"vc\":\"hello\",\"m\":[{\"_key\":\"k1\",\"_value\":\"v1\"}],\"a\":[100,200],\"st\":{\"i\":10,\"s\":\"foo\"}}", lines[0]); - assertEquals("{\"b\":false,\"bt\":20,\"s\":200,\"i\":2000,\"l\":20000,\"f\":8,\"d\":40,\"de\":\"2.2222\",\"t\":\"2014-11-25 18:02:44\",\"dt\":\"2014-09-28\",\"str\":\"abcd\",\"c\":\"world \",\"vc\":\"world\",\"m\":[{\"_key\":\"k3\",\"_value\":\"v3\"}],\"a\":[200,300],\"st\":{\"i\":20,\"s\":\"bar\"}}", lines[1]); + Assert.assertEquals("{\"b\":true,\"bt\":10,\"s\":100,\"i\":1000,\"l\":10000,\"f\":4,\"d\":20,\"de\":\"4.2222\",\"t\":\"2014-11-25 18:09:24.0\",\"dt\":\"2014-11-25\",\"str\":\"string\",\"c\":\"hello\",\"vc\":\"hello\",\"m\":[{\"_key\":\"k1\",\"_value\":\"v1\"}],\"a\":[100,200],\"st\":{\"i\":10,\"s\":\"foo\"}}", lines[0]); + Assert.assertEquals("{\"b\":false,\"bt\":20,\"s\":200,\"i\":2000,\"l\":20000,\"f\":8,\"d\":40,\"de\":\"2.2222\",\"t\":\"2014-11-25 18:02:44.0\",\"dt\":\"2014-09-28\",\"str\":\"abcd\",\"c\":\"world\",\"vc\":\"world\",\"m\":[{\"_key\":\"k3\",\"_value\":\"v3\"}],\"a\":[200,300],\"st\":{\"i\":20,\"s\":\"bar\"}}", lines[1]); } // Test that if the fraction of rows that have distinct strings is greater than the configured @@ -261,23 +321,19 @@ public void testDataDump() throws Exception { // of the dictionary stream for the column will be 0 in the ORC file dump. 
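The hunks above replace the ObjectInspector/addRow writer path with the vectorized one: build a TypeDescription schema, fill a VectorizedRowBatch, and flush it with addRowBatch. A condensed sketch of that loop, using only the imports added in this file and assuming the test fixture's conf, fs, and testFilePath plus illustrative row values (the dictionary-threshold and bloom-filter tests that follow use the same shape):

    TypeDescription schema = TypeDescription.createStruct()
        .addField("i", TypeDescription.createInt())
        .addField("l", TypeDescription.createLong())
        .addField("s", TypeDescription.createString());
    conf.set(OrcConf.ENCODING_STRATEGY.getAttribute(), "COMPRESSION");
    Writer writer = OrcFile.createWriter(testFilePath,
        OrcFile.writerOptions(conf)
            .fileSystem(fs)
            .setSchema(schema)
            .compress(CompressionKind.ZLIB)
            .stripeSize(100000)
            .rowIndexStride(1000));
    VectorizedRowBatch batch = schema.createRowBatch(1000);
    for (int r = 0; r < 21000; ++r) {
      int row = batch.size++;                                    // claim the next slot in the batch
      ((LongColumnVector) batch.cols[0]).vector[row] = r;        // "i" column
      ((LongColumnVector) batch.cols[1]).vector[row] = r * 10L;  // "l" column (illustrative value)
      ((BytesColumnVector) batch.cols[2]).setVal(row, ("row-" + r).getBytes());  // "s" column
      if (batch.size == batch.getMaxSize()) {                    // batch is full: write it and reuse
        writer.addRowBatch(batch);
        batch.reset();
      }
    }
    if (batch.size > 0) {                                        // flush the partial final batch
      writer.addRowBatch(batch);
    }
    writer.close();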
@Test public void testDictionaryThreshold() throws Exception { - ObjectInspector inspector; - synchronized (TestOrcFile.class) { - inspector = ObjectInspectorFactory.getReflectionObjectInspector - (MyRecord.class, ObjectInspectorFactory.ObjectInspectorOptions.JAVA); - } + TypeDescription schema = getMyRecordType(); Configuration conf = new Configuration(); - conf.set(HiveConf.ConfVars.HIVE_ORC_ENCODING_STRATEGY.varname, "COMPRESSION"); - conf.setFloat(HiveConf.ConfVars.HIVE_ORC_DICTIONARY_KEY_SIZE_THRESHOLD.varname, 0.49f); + conf.set(OrcConf.ENCODING_STRATEGY.getAttribute(), "COMPRESSION"); + conf.setFloat(OrcConf.DICTIONARY_KEY_SIZE_THRESHOLD.getAttribute(), 0.49f); Writer writer = OrcFile.createWriter(testFilePath, OrcFile.writerOptions(conf) .fileSystem(fs) - .batchSize(1000) - .inspector(inspector) + .setSchema(schema) .stripeSize(100000) .compress(CompressionKind.ZLIB) .rowIndexStride(1000) .bufferSize(10000)); + VectorizedRowBatch batch = schema.createRowBatch(1000); Random r1 = new Random(1); String[] words = new String[]{"It", "was", "the", "best", "of", "times,", "it", "was", "the", "worst", "of", "times,", "it", "was", "the", "age", @@ -300,8 +356,14 @@ public void testDictionaryThreshold() throws Exception { // the actual string is unique. words[nextInt] += "-" + i; } - writer.addRow(new MyRecord(r1.nextInt(), r1.nextLong(), - words[nextInt])); + appendMyRecord(batch, r1.nextInt(), r1.nextLong(), words[nextInt]); + if (batch.size == batch.getMaxSize()) { + writer.addRowBatch(batch); + batch.reset(); + } + } + if (batch.size != 0) { + writer.addRowBatch(batch); } writer.close(); PrintStream origOut = System.out; @@ -319,20 +381,15 @@ public void testDictionaryThreshold() throws Exception { @Test public void testBloomFilter() throws Exception { - ObjectInspector inspector; - synchronized (TestOrcFile.class) { - inspector = ObjectInspectorFactory.getReflectionObjectInspector - (MyRecord.class, ObjectInspectorFactory.ObjectInspectorOptions.JAVA); - } - conf.set(HiveConf.ConfVars.HIVE_ORC_ENCODING_STRATEGY.varname, "COMPRESSION"); + TypeDescription schema = getMyRecordType(); + conf.set(OrcConf.ENCODING_STRATEGY.getAttribute(), "COMPRESSION"); OrcFile.WriterOptions options = OrcFile.writerOptions(conf) .fileSystem(fs) - .inspector(inspector) + .setSchema(schema) .stripeSize(100000) .compress(CompressionKind.ZLIB) .bufferSize(10000) .rowIndexStride(1000) - .batchSize(1000) .bloomFilterColumns("S"); Writer writer = OrcFile.createWriter(testFilePath, options); Random r1 = new Random(1); @@ -347,9 +404,17 @@ public void testBloomFilter() throws Exception { "before", "us,", "we", "were", "all", "going", "direct", "to", "Heaven,", "we", "were", "all", "going", "direct", "the", "other", "way"}; + VectorizedRowBatch batch = schema.createRowBatch(1000); for(int i=0; i < 21000; ++i) { - writer.addRow(new MyRecord(r1.nextInt(), r1.nextLong(), - words[r1.nextInt(words.length)])); + appendMyRecord(batch, r1.nextInt(), r1.nextLong(), + words[r1.nextInt(words.length)]); + if (batch.size == batch.getMaxSize()) { + writer.addRowBatch(batch); + batch.reset(); + } + } + if (batch.size > 0) { + writer.addRowBatch(batch); } writer.close(); PrintStream origOut = System.out; @@ -368,22 +433,18 @@ public void testBloomFilter() throws Exception { @Test public void testBloomFilter2() throws Exception { - ObjectInspector inspector; - synchronized (TestOrcFile.class) { - inspector = ObjectInspectorFactory.getReflectionObjectInspector - (MyRecord.class, ObjectInspectorFactory.ObjectInspectorOptions.JAVA); - } - 
conf.set(HiveConf.ConfVars.HIVE_ORC_ENCODING_STRATEGY.varname, "COMPRESSION"); + TypeDescription schema = getMyRecordType(); + conf.set(OrcConf.ENCODING_STRATEGY.getAttribute(), "COMPRESSION"); OrcFile.WriterOptions options = OrcFile.writerOptions(conf) .fileSystem(fs) - .inspector(inspector) + .setSchema(schema) .stripeSize(100000) .compress(CompressionKind.ZLIB) .bufferSize(10000) .rowIndexStride(1000) .bloomFilterColumns("l") - .bloomFilterFpp(0.01) - .batchSize(1000); + .bloomFilterFpp(0.01); + VectorizedRowBatch batch = schema.createRowBatch(1000); Writer writer = OrcFile.createWriter(testFilePath, options); Random r1 = new Random(1); String[] words = new String[]{"It", "was", "the", "best", "of", "times,", @@ -398,8 +459,15 @@ public void testBloomFilter2() throws Exception { "Heaven,", "we", "were", "all", "going", "direct", "the", "other", "way"}; for(int i=0; i < 21000; ++i) { - writer.addRow(new MyRecord(r1.nextInt(), r1.nextLong(), - words[r1.nextInt(words.length)])); + appendMyRecord(batch, r1.nextInt(), r1.nextLong(), + words[r1.nextInt(words.length)]); + if (batch.size == batch.getMaxSize()) { + writer.addRowBatch(batch); + batch.reset(); + } + } + if (batch.size > 0) { + writer.addRowBatch(batch); } writer.close(); PrintStream origOut = System.out; diff --git a/ql/src/test/org/apache/hadoop/hive/ql/io/orc/TestJsonFileDump.java b/orc/src/test/org/apache/orc/tools/TestJsonFileDump.java similarity index 71% rename from ql/src/test/org/apache/hadoop/hive/ql/io/orc/TestJsonFileDump.java rename to orc/src/test/org/apache/orc/tools/TestJsonFileDump.java index acf232de3cf1..a514824f4018 100644 --- a/ql/src/test/org/apache/hadoop/hive/ql/io/orc/TestJsonFileDump.java +++ b/orc/src/test/org/apache/orc/tools/TestJsonFileDump.java @@ -16,7 +16,7 @@ * limitations under the License. 
*/ -package org.apache.hadoop.hive.ql.io.orc; +package org.apache.orc.tools; import static org.junit.Assert.assertEquals; import static org.junit.Assert.assertNull; @@ -26,20 +26,31 @@ import java.io.FileOutputStream; import java.io.FileReader; import java.io.PrintStream; +import java.net.URL; import java.util.Random; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.Path; -import org.apache.hadoop.hive.conf.HiveConf; -import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector; -import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorFactory; -import org.apache.hive.common.util.HiveTestUtils; +import org.apache.hadoop.hive.ql.exec.vector.BytesColumnVector; +import org.apache.hadoop.hive.ql.exec.vector.LongColumnVector; +import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch; import org.apache.orc.CompressionKind; +import org.apache.orc.OrcConf; +import org.apache.orc.OrcFile; +import org.apache.orc.TypeDescription; +import org.apache.orc.Writer; import org.junit.Before; import org.junit.Test; public class TestJsonFileDump { + public static String getFileFromClasspath(String name) { + URL url = ClassLoader.getSystemResource(name); + if (url == null) { + throw new IllegalArgumentException("Could not find " + name); + } + return url.getPath(); + } Path workDir = new Path(System.getProperty("test.tmp.dir")); Configuration conf; @@ -55,21 +66,10 @@ public void openFileSystem () throws Exception { fs.delete(testFilePath, false); } - static class MyRecord { - int i; - long l; - String s; - MyRecord(int i, long l, String s) { - this.i = i; - this.l = l; - this.s = s; - } - } - static void checkOutput(String expected, String actual) throws Exception { BufferedReader eStream = - new BufferedReader(new FileReader(HiveTestUtils.getFileFromClasspath(expected))); + new BufferedReader(new FileReader(getFileFromClasspath(expected))); BufferedReader aStream = new BufferedReader(new FileReader(actual)); String expectedLine = eStream.readLine(); @@ -86,15 +86,14 @@ static void checkOutput(String expected, @Test public void testJsonDump() throws Exception { - ObjectInspector inspector; - synchronized (TestOrcFile.class) { - inspector = ObjectInspectorFactory.getReflectionObjectInspector - (MyRecord.class, ObjectInspectorFactory.ObjectInspectorOptions.JAVA); - } - conf.set(HiveConf.ConfVars.HIVE_ORC_ENCODING_STRATEGY.varname, "COMPRESSION"); + TypeDescription schema = TypeDescription.createStruct() + .addField("i", TypeDescription.createInt()) + .addField("l", TypeDescription.createLong()) + .addField("s", TypeDescription.createString()); + conf.set(OrcConf.ENCODING_STRATEGY.getAttribute(), "COMPRESSION"); OrcFile.WriterOptions options = OrcFile.writerOptions(conf) .fileSystem(fs) - .inspector(inspector) + .setSchema(schema) .stripeSize(100000) .compress(CompressionKind.ZLIB) .bufferSize(10000) @@ -113,13 +112,25 @@ public void testJsonDump() throws Exception { "before", "us,", "we", "were", "all", "going", "direct", "to", "Heaven,", "we", "were", "all", "going", "direct", "the", "other", "way"}; + VectorizedRowBatch batch = schema.createRowBatch(1000); for(int i=0; i < 21000; ++i) { + ((LongColumnVector) batch.cols[0]).vector[batch.size] = r1.nextInt(); + ((LongColumnVector) batch.cols[1]).vector[batch.size] = r1.nextLong(); if (i % 100 == 0) { - writer.addRow(new MyRecord(r1.nextInt(), r1.nextLong(), null)); + batch.cols[2].noNulls = false; + batch.cols[2].isNull[batch.size] = true; } else { - writer.addRow(new 
MyRecord(r1.nextInt(), r1.nextLong(), - words[r1.nextInt(words.length)])); + ((BytesColumnVector) batch.cols[2]).setVal(batch.size, + words[r1.nextInt(words.length)].getBytes()); } + batch.size += 1; + if (batch.size == batch.getMaxSize()) { + writer.addRowBatch(batch); + batch.reset(); + } + } + if (batch.size > 0) { + writer.addRowBatch(batch); } writer.close(); diff --git a/orc/src/test/resources/orc-file-11-format.orc b/orc/src/test/resources/orc-file-11-format.orc new file mode 100644 index 0000000000000000000000000000000000000000..41653c840354a2711a262766ad0f170b34d2062c GIT binary patch literal 373336 zcmeI*Z-^ZCeaG?H*}c2DIq9_8JDo-+RFn%r!4HH^|4tHVEC?a=i31^iZ1!}jZmD!C zk?w)%gByZzTkr!%4~j8K6R2Bgp|wlX5(J5lxG{cEYA9)1T5t<}kdg*$Ts^5sPMtq{ z-jl77y++Z>nb+;@&ToJF+qs#ye&6@!w z=w^S|>}D}6z4YIkaa^8DabAP7SeT5P%THhIgsZ*nlb$S&tQePLcoov7R*&_?>c3tc#WPE%&Ma*_S>7%_dFQKXV`a2A2Y>(q2q1s}0tg_000IagfB*srAbyx(ppLbgL$A1cz_3ZfCqSh2Y7%7cz_3ZfCqSh z2Y7%7cz_3ZfCqSh2Y7%7cz_3ZfCqSh2Y7%7cz_3ZfCqSh2Y7%7cz_3ZfCqSh2Y7%7 zcz_3ZfCqSh2Y7%7cz_3ZfCqSh2Y7%7cz_3ZfCqSh2Y7%7cwkf>$f_?mx|u%)!R%(S z@x0Fg5Az`>pgziZpdQqNdf)>- z%n8VYavrD$^`IX3fDidFCm;{Xd7vKDgL>ctKIFrkfIKMYfqGC6>VXgVkPmYL@}Qgt z>OnoI2R`6KKFkTogK{3I2lb#H_<#@jFee}n%6Xt3)Ps8913u)#oPazi=Ye`q59)yr z_>d2C0`j1o2kJpRs0TjaLq5z2$b)hos0a0+9{7L{`7kFS56XF<9@K++-~&G7!<>LT zDCdECP!H;X5BQJ|a{}_9oCoSbJ*Wph;6pyl3CM$T9;gTPpdR>u5BV@BAP>rUpdQqN zdf)>-%n8VYavrD$^`IX3fDidF zCm;{Xd7vKDgL>ctKIFrkfIKMYfqGC6>Y?yKBW$(;4;YvS%76-(kO`TP37L=ynUD#Y zkO`TP37L=ynUD#YkO`TP37L=ynUD#YkO`TP37L=ynUD#YkO`TP37L=ynUD#YkO`Ua zp_wop3r2f$009(4! z>_3@zt#@fP4V$CAIRFF@KmY**5I_I{1Q0*~0R#|0009ILxLsbPOUI-1nU|k>^~GH; z&1mlAkYZ9XAyQMOa&Dx`=-xVEnQFz7a8&}4a8)Xk@XoNl+s7naVcUxCdmFoEy(CP+ z!+r=NVGppDe!n zYQu*bfP^b-ThaaK8*=#N+mKV{r5O??VG<_c>I*C%nmZqD013MsF?^o69N}`r@ar@y z;R@SUbPrGXqdET*wO=b?D`6{PD`6|)5z3u}NtlF5n1o6A5&W>?a)iqf!+)rCIijMJ zE4qg#JVIFxe@*kF{T-c^u$8ctu$8ct@CfBj!X!+>Buv62{0M$naXG@}h~YmhyBtwb z$`#$i6CR-~U5=<)v3|76F2nO9VGRn1qK{1PNCtqN00v!X#WHlq5{T!z+S>D-==DJv?C&t`SNSCgI@~LBbV^sOTP^ zFbUTPB?*)8@QNVe3Pn_O4^NncYlM=7t0er_5AL7UK9q|3Wf(MqJFP8`2q1s}0tg_0 z00IagfB*srAbeZFi?)BbU8ib?0IRFF@KmY** z5I_I{1Q0*~0R#|0009ILxLsbPOS3C`BfU8Q1Q0*~0R#|0009ILKmY**5I_I{1Q57g zUZmR^o4ByrUF%)xuCJ{0meV*K?acuofB*srAbT9kH*_U zGEr=Pxjvz_xCrIOcnjR`pQ=8i!v44BS{{D#l*@*ndB+2X9xsFF;S!=!M zIgtFtH^QRlK(gKmqu@jDtsLS9Mt!h1?-?t`-Tm5K&*HI8GkSd{ZpY0a2s)El`>|FW z{?}wY8;4oPbk>pehV3^R$shmSo_Mb4ZRfpHQQB^&hm-XAe{RMHivF3re>zIjsGZ%D zG<_xeWBgR{(!u_2ji7Tl8!>-yGCPdy>z&8Cu?#RMaxr^4d-lf&}eeZtzSbVT}_rB=uW1V*N>WjPD=`$}smHg2^{_Mt| zOuWbQ?PJA}bXPiIZ*AvJw%`5w%tAXJyRh0_>vblsbk|o_ddu5KynTEb%l!D_{P^zs zzNxsC<17D=zgU9#drrQr7Cczh`NF=ESix`Pb6fvnP)( zE}b~Nc>2V~lN(PiK6&S>X=7!yHwSyyv_@BvT) z96*A#_CT#qUJ8go>i`QO60{Pu60{PmwFhc_@>YWB;hT@r4A=;=zg_>;Jsc``KluCk zjo{a_eQ3vd1ws0F_7~;nZE-LU)Bz9h01xm05AXmF@Bk0+01xm05AXmF@Bk0+01xm0 z5AXmF@Bk0+01xm05AXmF@Bk0+z^{h~va<+|Zsw0cFuPf7JnwVB!+gjIsE=|Us0a0+ z9{7L{`7kFS56XF<9@K++-~&G7!<>LTDCdECP!H;X5BQJ|a{}_9oCoSbJ*Wph;6pyl z3CM$T9;gTPpdR>u5BV@BAP>rUpdQqNdf)>-%n8VYavrD$^`IX3fDidFCm;{Xd7vKDgL>ctKIFrkfIKMYfqGC6>ftUQ zG{R;p@PL7NpbV&h37L=ynUD#YkO`TP37L=ynUD#YkO`TP37L=yKeh?ev0$_}2Y>(q z2q1s}0tg^*^T900(nc`an*%@q0R#|0009ILxcOifWLwQ}`RR+Ry^SFJc;APDFb!Ly zy*U5`5I_I{1Q0*~0R#|0009ILKmY**5V*DTB3&9hnCpihdoKeUx_y)>h_lS7J0#e_&rnaa75Dx-Vrgk`D~OTtwNM8Z|6Ou{?E`feYSaD{Cvy6Bpwt>_+}@X-2y^nZkvu$8ct z@O`rQ?yC(SY5)?hux&;6qi@LJmv2K(nU`irn1o4~gsU&Gd}!``v;idSa>Vd?=5mC~ z5yP+3tb{9UThTo{;g9D0Pt<;`gsp_Fgsp_FghwcM5+-31CSejL;YaYpipvo$M-2a= z_8TLt<>4>64gFV0_x=&T471Myt8iY-WFeZFi?)BbU8ib?0IRFF@KmY**5I_Kd zn-69|=Km6e00IagfB*srATV+QH?vD}58$>nHgRFKyVkqXU0+%0EvIof+M5GF009IL 
zKmY**5V-kZ7Gxt|usQ$)5I_I{1Q0*~fnU)jyl>xn{rk;t2Kn7&Mff4G~Vn7CDKnARv_Xg*CbI{a!@c%Y? zBM3g;_n{z2pZnok&)+!n^>X7G5;xA61!jR+z>0Y=Z~+&X1zf-d*2z92NJD8T4W*$p zl!nq!8cIWHC=I2dG?a$YP#Q`@X($b)p){0+(oh;oLun`trJ*#GhSE?PN<(QV4W*$p zl!nq!8cIWHC=I2dG?a$YP#Q`@X($b)p){0+ziw%`5lnx)??b_KJRXg=gJh!EHj+-U z4U@TI+f0&T+e!`=+p*-+#dbV7Qfw#s+bG#pJfG~pFYfPeDw!(!clDp|PG*be)BXMK z>A&9YzrHs)RJ`8l?`Nj}`fR^{Uw_Hf&+?2z+)?b+Vf-sIiiDTa2Ee>ztT?Ih#HPCLn- zV(85F(An)DJKNvcY{of1uet3J`?p8z?~mBuACV;QcC($1wUg`!^V2(!{Ok`JMbClc zbDb~>KJ?zo-W7ge)CYU>p0Q%w-LKvCEMxYAx8~;eWZS=OCBJ-j&*A@RefHTS^LwN3 zJin*?H?8EEA59$oY3sAE&n*02w5v1S2%D|3@rh_M-ZPUNn2(mPUcG#=yS6w#w*2(< zi;MH)*H*87?fT+;tGAXtj$OXGb}8%p?aNoY+0%)YtLw{`^HKTWYgxy&>tE{?Bd+vT zRxV$hZ(Z(PyFTB%(*51}@cOm+XyxLC-j(j<#f1z^c5h_&FuOOidn>z-W%u#y9%c85 zg{kZS8&l;z7RI8u)7Z7JFY60;Pxp^!`{!lD^Ic4oquS-Dy`8F2mww10%jIjk^Ha}%YFzx(`wjigZ0PYbaeMLj ziIb;JFP%Ah`qas#tkrpc&(eJ7?D13C_nEV27mppyS|^rHoqzs`=npg9E`-6<*KakO KU;fG$+W!y8RG5AM literal 0 HcmV?d00001 diff --git a/ql/src/test/resources/orc-file-dump-bloomfilter.out b/orc/src/test/resources/orc-file-dump-bloomfilter.out similarity index 100% rename from ql/src/test/resources/orc-file-dump-bloomfilter.out rename to orc/src/test/resources/orc-file-dump-bloomfilter.out diff --git a/ql/src/test/resources/orc-file-dump-bloomfilter2.out b/orc/src/test/resources/orc-file-dump-bloomfilter2.out similarity index 100% rename from ql/src/test/resources/orc-file-dump-bloomfilter2.out rename to orc/src/test/resources/orc-file-dump-bloomfilter2.out diff --git a/ql/src/test/resources/orc-file-dump-dictionary-threshold.out b/orc/src/test/resources/orc-file-dump-dictionary-threshold.out similarity index 100% rename from ql/src/test/resources/orc-file-dump-dictionary-threshold.out rename to orc/src/test/resources/orc-file-dump-dictionary-threshold.out diff --git a/ql/src/test/resources/orc-file-dump.json b/orc/src/test/resources/orc-file-dump.json similarity index 100% rename from ql/src/test/resources/orc-file-dump.json rename to orc/src/test/resources/orc-file-dump.json diff --git a/ql/src/test/resources/orc-file-dump.out b/orc/src/test/resources/orc-file-dump.out similarity index 100% rename from ql/src/test/resources/orc-file-dump.out rename to orc/src/test/resources/orc-file-dump.out diff --git a/ql/src/test/resources/orc-file-has-null.out b/orc/src/test/resources/orc-file-has-null.out similarity index 100% rename from ql/src/test/resources/orc-file-has-null.out rename to orc/src/test/resources/orc-file-has-null.out diff --git a/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/expressions/CastDecimalToTimestamp.java b/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/expressions/CastDecimalToTimestamp.java index 6225adeaefb3..8963449a13a2 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/expressions/CastDecimalToTimestamp.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/expressions/CastDecimalToTimestamp.java @@ -20,12 +20,9 @@ import java.sql.Timestamp; -import org.apache.hadoop.hive.common.type.HiveDecimal; import org.apache.hadoop.hive.ql.exec.vector.DecimalColumnVector; -import org.apache.hadoop.hive.ql.exec.vector.LongColumnVector; import org.apache.hadoop.hive.ql.exec.vector.TimestampColumnVector; -import org.apache.hadoop.hive.ql.exec.vector.TimestampUtils; -import org.apache.hadoop.hive.serde2.io.TimestampWritable; +import org.apache.hadoop.hive.ql.util.TimestampUtils; /** * Type cast decimal to timestamp. 
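The expression changes below route timestamp construction through the new org.apache.hadoop.hive.ql.util.TimestampUtils helper instead of TimestampWritable. A sketch of the conversion step each cast now performs, with variable names borrowed from the hunks that follow (illustrative fragments, not a drop-in method body):

    // decimal: convert the HiveDecimal, then store it in the output TimestampColumnVector
    Timestamp timestamp = TimestampUtils.decimalToTimestamp(inV.vector[i].getHiveDecimal());
    outV.set(i, timestamp);

    // double: TimestampUtils.doubleToTimestamp replaces TimestampWritable.setTimestampFromDouble
    timestampColVector.set(elementNum, TimestampUtils.doubleToTimestamp(vector[elementNum]));

    // long seconds (and milliseconds, without the * 1000) write into the scratch Timestamp directly
    timestampColVector.getScratchTimestamp().setTime(vector[elementNum] * 1000);
    timestampColVector.setFromScratchTimestamp(elementNum);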
The decimal value is interpreted @@ -44,6 +41,7 @@ public CastDecimalToTimestamp() { @Override protected void func(TimestampColumnVector outV, DecimalColumnVector inV, int i) { - outV.set(i, TimestampWritable.decimalToTimestamp(inV.vector[i].getHiveDecimal())); + Timestamp timestamp = TimestampUtils.decimalToTimestamp(inV.vector[i].getHiveDecimal()); + outV.set(i, timestamp); } } diff --git a/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/expressions/CastDoubleToTimestamp.java b/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/expressions/CastDoubleToTimestamp.java index 31d2f783d8f2..07f94f57da7e 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/expressions/CastDoubleToTimestamp.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/expressions/CastDoubleToTimestamp.java @@ -18,9 +18,11 @@ package org.apache.hadoop.hive.ql.exec.vector.expressions; -import org.apache.hadoop.hive.ql.exec.vector.expressions.VectorExpression; -import org.apache.hadoop.hive.ql.exec.vector.*; -import org.apache.hadoop.hive.serde2.io.TimestampWritable; +import org.apache.hadoop.hive.ql.exec.vector.DoubleColumnVector; +import org.apache.hadoop.hive.ql.exec.vector.TimestampColumnVector; +import org.apache.hadoop.hive.ql.exec.vector.VectorExpressionDescriptor; +import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch; +import org.apache.hadoop.hive.ql.util.TimestampUtils; public class CastDoubleToTimestamp extends VectorExpression { private static final long serialVersionUID = 1L; @@ -40,9 +42,8 @@ public CastDoubleToTimestamp() { private void setDouble(TimestampColumnVector timestampColVector, double[] vector, int elementNum) { - TimestampWritable.setTimestampFromDouble( - timestampColVector.getScratchTimestamp(), vector[elementNum]); - timestampColVector.setFromScratchTimestamp(elementNum); + timestampColVector.set(elementNum, + TimestampUtils.doubleToTimestamp(vector[elementNum])); } @Override diff --git a/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/expressions/CastLongToTimestamp.java b/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/expressions/CastLongToTimestamp.java index a2ee52db11e2..4de95a575382 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/expressions/CastLongToTimestamp.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/expressions/CastLongToTimestamp.java @@ -39,9 +39,7 @@ public CastLongToTimestamp() { } private void setSeconds(TimestampColumnVector timestampColVector, long[] vector, int elementNum) { - TimestampWritable.setTimestampFromLong( - timestampColVector.getScratchTimestamp(), vector[elementNum], - /* intToTimestampInSeconds */ true); + timestampColVector.getScratchTimestamp().setTime(vector[elementNum] * 1000); timestampColVector.setFromScratchTimestamp(elementNum); } diff --git a/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/expressions/CastMillisecondsLongToTimestamp.java b/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/expressions/CastMillisecondsLongToTimestamp.java index 01c8810cfa06..b1c6b2de7fe4 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/expressions/CastMillisecondsLongToTimestamp.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/expressions/CastMillisecondsLongToTimestamp.java @@ -38,10 +38,9 @@ public CastMillisecondsLongToTimestamp() { super(); } - private void setMilliseconds(TimestampColumnVector timestampColVector, long[] vector, int elementNum) { - TimestampWritable.setTimestampFromLong( - timestampColVector.getScratchTimestamp(), vector[elementNum], - /* 
intToTimestampInSeconds */ false); + private void setMilliseconds(TimestampColumnVector timestampColVector, + long[] vector, int elementNum) { + timestampColVector.getScratchTimestamp().setTime(vector[elementNum]); timestampColVector.setFromScratchTimestamp(elementNum); } diff --git a/ql/src/java/org/apache/hadoop/hive/ql/hooks/PostExecOrcFileDump.java b/ql/src/java/org/apache/hadoop/hive/ql/hooks/PostExecOrcFileDump.java index d5d13708834b..e184fcb8fa72 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/hooks/PostExecOrcFileDump.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/hooks/PostExecOrcFileDump.java @@ -30,8 +30,8 @@ import org.apache.hadoop.hive.conf.HiveConf; import org.apache.hadoop.hive.ql.QueryPlan; import org.apache.hadoop.hive.ql.exec.FetchTask; -import org.apache.hadoop.hive.ql.io.FileFormatException; -import org.apache.hadoop.hive.ql.io.orc.FileDump; +import org.apache.orc.FileFormatException; +import org.apache.orc.tools.FileDump; import org.apache.hadoop.hive.ql.io.orc.OrcFile; import org.apache.hadoop.hive.ql.plan.FetchWork; import org.apache.hadoop.hive.ql.session.SessionState; diff --git a/ql/src/java/org/apache/hadoop/hive/ql/io/orc/OrcRawRecordMerger.java b/ql/src/java/org/apache/hadoop/hive/ql/io/orc/OrcRawRecordMerger.java index 0dd58b700aee..b9094bf7efc0 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/io/orc/OrcRawRecordMerger.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/io/orc/OrcRawRecordMerger.java @@ -18,10 +18,7 @@ package org.apache.hadoop.hive.ql.io.orc; import java.io.IOException; -import java.util.ArrayDeque; -import java.util.ArrayList; import java.util.Arrays; -import java.util.Deque; import java.util.List; import java.util.Map; import java.util.TreeMap; @@ -29,22 +26,20 @@ import org.apache.orc.OrcUtils; import org.apache.orc.StripeInformation; import org.apache.orc.TypeDescription; +import org.apache.orc.impl.AcidStats; +import org.apache.orc.impl.OrcAcidUtils; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import org.apache.hadoop.conf.Configuration; -import org.apache.hadoop.fs.FSDataInputStream; import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.Path; import org.apache.hadoop.hive.common.ValidTxnList; -import org.apache.hadoop.hive.ql.ErrorMsg; import org.apache.hadoop.hive.ql.io.AcidInputFormat; import org.apache.hadoop.hive.ql.io.AcidUtils; import org.apache.hadoop.hive.ql.io.RecordIdentifier; import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector; import org.apache.hadoop.io.IntWritable; import org.apache.hadoop.io.LongWritable; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; import com.google.common.annotations.VisibleForTesting; @@ -494,7 +489,7 @@ static Reader.Options createEventOptions(Reader.Options options) { Path deltaFile = AcidUtils.createBucketFile(delta, bucket); AcidUtils.ParsedDelta deltaDir = AcidUtils.parsedDelta(delta); FileSystem fs = deltaFile.getFileSystem(conf); - long length = getLastFlushLength(fs, deltaFile); + long length = OrcAcidUtils.getLastFlushLength(fs, deltaFile); if (length != -1 && fs.exists(deltaFile)) { Reader deltaReader = OrcFile.createReader(deltaFile, OrcFile.readerOptions(conf).maxLength(length)); @@ -504,7 +499,7 @@ static Reader.Options createEventOptions(Reader.Options options) { // it can produce wrong results (if the latest valid version of the record is filtered out by // the sarg) or ArrayOutOfBounds errors (when the sarg is applied to a delete record) // unless the delta only has insert events - OrcRecordUpdater.AcidStats acidStats = 
OrcRecordUpdater.parseAcidStats(deltaReader); + AcidStats acidStats = OrcAcidUtils.parseAcidStats(deltaReader); if(acidStats.deletes > 0 || acidStats.updates > 0) { deltaEventOptions = eventOptions.clone().searchArgument(null, null); } @@ -536,28 +531,6 @@ static Reader.Options createEventOptions(Reader.Options options) { } } - /** - * Read the side file to get the last flush length. - * @param fs the file system to use - * @param deltaFile the path of the delta file - * @return the maximum size of the file to use - * @throws IOException - */ - static long getLastFlushLength(FileSystem fs, - Path deltaFile) throws IOException { - Path lengths = OrcRecordUpdater.getSideFile(deltaFile); - long result = Long.MAX_VALUE; - try (FSDataInputStream stream = fs.open(lengths)) { - result = -1; - while (stream.available() > 0) { - result = stream.readLong(); - } - return result; - } catch (IOException ioe) { - return result; - } - } - @VisibleForTesting RecordIdentifier getMinKey() { return minKey; diff --git a/ql/src/java/org/apache/hadoop/hive/ql/io/orc/OrcRecordUpdater.java b/ql/src/java/org/apache/hadoop/hive/ql/io/orc/OrcRecordUpdater.java index d085c58b908f..4bf2403704f4 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/io/orc/OrcRecordUpdater.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/io/orc/OrcRecordUpdater.java @@ -25,6 +25,8 @@ import java.util.ArrayList; import java.util.List; +import org.apache.orc.impl.AcidStats; +import org.apache.orc.impl.OrcAcidUtils; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import org.apache.hadoop.conf.Configuration; @@ -55,7 +57,6 @@ public class OrcRecordUpdater implements RecordUpdater { public static final String ACID_KEY_INDEX_NAME = "hive.acid.key.index"; public static final String ACID_FORMAT = "_orc_acid_version"; - public static final String ACID_STATS = "hive.acid.stats"; public static final int ORC_ACID_VERSION = 0; @@ -102,46 +103,6 @@ public class OrcRecordUpdater implements RecordUpdater { private LongObjectInspector origTxnInspector; // OI for the original txn inside the record // identifer - static class AcidStats { - long inserts; - long updates; - long deletes; - - AcidStats() { - // nothing - } - - AcidStats(String serialized) { - String[] parts = serialized.split(","); - inserts = Long.parseLong(parts[0]); - updates = Long.parseLong(parts[1]); - deletes = Long.parseLong(parts[2]); - } - - String serialize() { - StringBuilder builder = new StringBuilder(); - builder.append(inserts); - builder.append(","); - builder.append(updates); - builder.append(","); - builder.append(deletes); - return builder.toString(); - } - - @Override - public String toString() { - StringBuilder builder = new StringBuilder(); - builder.append(" inserts: ").append(inserts); - builder.append(" updates: ").append(updates); - builder.append(" deletes: ").append(deletes); - return builder.toString(); - } - } - - public static Path getSideFile(Path main) { - return new Path(main + AcidUtils.DELTA_SIDE_FILE_SUFFIX); - } - static int getOperation(OrcStruct struct) { return ((IntWritable) struct.getFieldValue(OPERATION)).get(); } @@ -237,7 +198,7 @@ static StructObjectInspector createEventSchema(ObjectInspector rowInspector) { } if (options.getMinimumTransactionId() != options.getMaximumTransactionId() && !options.isWritingBase()){ - flushLengths = fs.create(getSideFile(this.path), true, 8, + flushLengths = fs.create(OrcAcidUtils.getSideFile(this.path), true, 8, options.getReporter()); } else { flushLengths = null; @@ -297,7 +258,7 @@ private long 
findRowIdOffsetForInsert() throws IOException { } Reader reader = OrcFile.createReader(matchingBucket, OrcFile.readerOptions(options.getConfiguration())); //no close() on Reader?! - AcidStats acidStats = parseAcidStats(reader); + AcidStats acidStats = OrcAcidUtils.parseAcidStats(reader); if(acidStats.inserts > 0) { return acidStats.inserts; } @@ -412,7 +373,7 @@ public void close(boolean abort) throws IOException { } if (flushLengths != null) { flushLengths.close(); - fs.delete(getSideFile(path), false); + fs.delete(OrcAcidUtils.getSideFile(path), false); } writer = null; } @@ -456,26 +417,6 @@ static RecordIdentifier[] parseKeyIndex(Reader reader) { } return result; } - /** - * {@link KeyIndexBuilder} creates these - */ - static AcidStats parseAcidStats(Reader reader) { - if (reader.hasMetadataValue(OrcRecordUpdater.ACID_STATS)) { - String statsSerialized; - try { - ByteBuffer val = - reader.getMetadataValue(OrcRecordUpdater.ACID_STATS) - .duplicate(); - statsSerialized = utf8Decoder.decode(val).toString(); - } catch (CharacterCodingException e) { - throw new IllegalArgumentException("Bad string encoding for " + - OrcRecordUpdater.ACID_STATS, e); - } - return new AcidStats(statsSerialized); - } else { - return null; - } - } static class KeyIndexBuilder implements OrcFile.WriterCallback { StringBuilder lastKey = new StringBuilder(); @@ -500,7 +441,7 @@ public void preFooterWrite(OrcFile.WriterContext context ) throws IOException { context.getWriter().addUserMetadata(ACID_KEY_INDEX_NAME, UTF8.encode(lastKey.toString())); - context.getWriter().addUserMetadata(ACID_STATS, + context.getWriter().addUserMetadata(OrcAcidUtils.ACID_STATS, UTF8.encode(acidStats.serialize())); } diff --git a/ql/src/java/org/apache/hadoop/hive/ql/io/orc/ReaderImpl.java b/ql/src/java/org/apache/hadoop/hive/ql/io/orc/ReaderImpl.java index b7437bed7716..3a2e7d8049a3 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/io/orc/ReaderImpl.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/io/orc/ReaderImpl.java @@ -22,17 +22,9 @@ import java.nio.ByteBuffer; import java.util.ArrayList; import java.util.Arrays; -import java.util.Collections; -import java.util.HashSet; import java.util.List; -import java.util.Set; -import com.google.common.collect.Lists; -import org.apache.orc.OrcUtils; -import org.apache.orc.TypeDescription; import org.apache.orc.impl.BufferChunk; -import org.apache.orc.ColumnStatistics; -import org.apache.orc.impl.ColumnStatisticsImpl; import org.apache.orc.CompressionCodec; import org.apache.orc.FileMetaInfo; import org.apache.orc.FileMetadata; @@ -41,47 +33,25 @@ import org.apache.orc.StripeStatistics; import org.slf4j.Logger; import org.slf4j.LoggerFactory; -import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.FSDataInputStream; import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.Path; import org.apache.hadoop.hive.common.io.DiskRange; -import org.apache.hadoop.hive.ql.io.FileFormatException; import org.apache.hadoop.hive.ql.io.sarg.SearchArgument; -import org.apache.hadoop.hive.ql.util.JavaDataModel; import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector; -import org.apache.hadoop.io.Text; import org.apache.orc.OrcProto; +import com.google.common.collect.Lists; import com.google.protobuf.CodedInputStream; -public class ReaderImpl implements Reader { +public class ReaderImpl extends org.apache.orc.impl.ReaderImpl + implements Reader { private static final Logger LOG = LoggerFactory.getLogger(ReaderImpl.class); private static final int DIRECTORY_SIZE_GUESS = 
16 * 1024; - protected final FileSystem fileSystem; - private final long maxLength; - protected final Path path; - protected final org.apache.orc.CompressionKind compressionKind; - protected final CompressionCodec codec; - protected final int bufferSize; - private final List stripeStats; - private final int metadataSize; - protected final List types; - private final TypeDescription schema; - private final List userMetadata; - private final List fileStats; - private final List stripes; - protected final int rowIndexStride; - private final long contentLength, numberOfRows; - private final ObjectInspector inspector; - private long deserializedSize = -1; - protected final Configuration conf; - private final List versionList; - private final OrcFile.WriterVersion writerVersion; //serialized footer - Keeping this around for use by getFileMetaInfo() // will help avoid cpu cycles spend in deserializing at cost of increased @@ -91,83 +61,9 @@ public class ReaderImpl implements Reader { // This will only be set if the file footer/metadata was read from disk. private final ByteBuffer footerMetaAndPsBuffer; - public static class StripeInformationImpl - implements StripeInformation { - private final OrcProto.StripeInformation stripe; - - public StripeInformationImpl(OrcProto.StripeInformation stripe) { - this.stripe = stripe; - } - - @Override - public long getOffset() { - return stripe.getOffset(); - } - - @Override - public long getLength() { - return stripe.getDataLength() + getIndexLength() + getFooterLength(); - } - - @Override - public long getDataLength() { - return stripe.getDataLength(); - } - - @Override - public long getFooterLength() { - return stripe.getFooterLength(); - } - - @Override - public long getIndexLength() { - return stripe.getIndexLength(); - } - - @Override - public long getNumberOfRows() { - return stripe.getNumberOfRows(); - } - - @Override - public String toString() { - return "offset: " + getOffset() + " data: " + getDataLength() + - " rows: " + getNumberOfRows() + " tail: " + getFooterLength() + - " index: " + getIndexLength(); - } - } - @Override - public long getNumberOfRows() { - return numberOfRows; - } - - @Override - public List getMetadataKeys() { - List result = new ArrayList(); - for(OrcProto.UserMetadataItem item: userMetadata) { - result.add(item.getName()); - } - return result; - } - - @Override - public ByteBuffer getMetadataValue(String key) { - for(OrcProto.UserMetadataItem item: userMetadata) { - if (item.hasName() && item.getName().equals(key)) { - return item.getValue().asReadOnlyByteBuffer(); - } - } - throw new IllegalArgumentException("Can't find user metadata " + key); - } - - public boolean hasMetadataValue(String key) { - for(OrcProto.UserMetadataItem item: userMetadata) { - if (item.hasName() && item.getName().equals(key)) { - return true; - } - } - return false; + public ObjectInspector getObjectInspector() { + return inspector; } @Override @@ -181,181 +77,19 @@ public org.apache.hadoop.hive.ql.io.orc.CompressionKind getCompression() { compressionKind); } - @Override - public org.apache.orc.CompressionKind getCompressionKind() { - return compressionKind; - } - - @Override - public int getCompressionSize() { - return bufferSize; - } - - @Override - public List getStripes() { - return stripes; - } - - @Override - public ObjectInspector getObjectInspector() { - return inspector; - } - - @Override - public long getContentLength() { - return contentLength; - } - - @Override - public List getTypes() { - return types; - } - - @Override - public 
OrcFile.Version getFileVersion() { - for (OrcFile.Version version: OrcFile.Version.values()) { - if ((versionList != null && !versionList.isEmpty()) && - version.getMajor() == versionList.get(0) && - version.getMinor() == versionList.get(1)) { - return version; - } - } - return OrcFile.Version.V_0_11; - } - - @Override - public OrcFile.WriterVersion getWriterVersion() { - return writerVersion; - } - - @Override - public int getRowIndexStride() { - return rowIndexStride; - } - - @Override - public ColumnStatistics[] getStatistics() { - ColumnStatistics[] result = new ColumnStatistics[types.size()]; - for(int i=0; i < result.length; ++i) { - result[i] = ColumnStatisticsImpl.deserialize(fileStats.get(i)); - } - return result; - } - - @Override - public TypeDescription getSchema() { - return schema; - } - - /** - * Ensure this is an ORC file to prevent users from trying to read text - * files or RC files as ORC files. - * @param in the file being read - * @param path the filename for error messages - * @param psLen the postscript length - * @param buffer the tail of the file - * @throws IOException - */ - static void ensureOrcFooter(FSDataInputStream in, - Path path, - int psLen, - ByteBuffer buffer) throws IOException { - int magicLength = OrcFile.MAGIC.length(); - int fullLength = magicLength + 1; - if (psLen < fullLength || buffer.remaining() < fullLength) { - throw new FileFormatException("Malformed ORC file " + path + - ". Invalid postscript length " + psLen); - } - int offset = buffer.arrayOffset() + buffer.position() + buffer.limit() - fullLength; - byte[] array = buffer.array(); - // now look for the magic string at the end of the postscript. - if (!Text.decode(array, offset, magicLength).equals(OrcFile.MAGIC)) { - // If it isn't there, this may be the 0.11.0 version of ORC. - // Read the first 3 bytes of the file to check for the header - byte[] header = new byte[magicLength]; - in.readFully(0, header, 0, magicLength); - // if it isn't there, this isn't an ORC file - if (!Text.decode(header, 0 , magicLength).equals(OrcFile.MAGIC)) { - throw new FileFormatException("Malformed ORC file " + path + - ". Invalid postscript."); - } - } - } - - /** - * Build a version string out of an array. - * @param version the version number as a list - * @return the human readable form of the version string - */ - private static String versionString(List version) { - StringBuilder buffer = new StringBuilder(); - for(int i=0; i < version.size(); ++i) { - if (i != 0) { - buffer.append('.'); - } - buffer.append(version.get(i)); - } - return buffer.toString(); - } - - /** - * Check to see if this ORC file is from a future version and if so, - * warn the user that we may not be able to read all of the column encodings. - * @param log the logger to write any error message to - * @param path the data source path for error messages - * @param version the version of hive that wrote the file. - */ - static void checkOrcVersion(Logger log, Path path, List version) { - if (version.size() >= 1) { - int major = version.get(0); - int minor = 0; - if (version.size() >= 2) { - minor = version.get(1); - } - if (major > OrcFile.Version.CURRENT.getMajor() || - (major == OrcFile.Version.CURRENT.getMajor() && - minor > OrcFile.Version.CURRENT.getMinor())) { - log.warn(path + " was written by a future Hive version " + - versionString(version) + - ". This file may not be readable by this version of Hive."); - } - } - } - /** * Constructor that let's the user specify additional options. 
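With the duplicated metadata and statistics methods removed above, the Hive-side ReaderImpl is reduced to a shim over org.apache.orc.impl.ReaderImpl. A minimal, simplified sketch of the resulting shape, pieced together from this hunk; the inspector field's declaration is assumed (only its initialization is visible below), and the real constructor branches on cached FileMetadata as the hunk shows:

    package org.apache.hadoop.hive.ql.io.orc;   // sketch of the delegating reader, not the full class

    import java.io.IOException;
    import org.apache.hadoop.fs.Path;
    import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;

    public class ReaderImpl extends org.apache.orc.impl.ReaderImpl implements Reader {
      private final ObjectInspector inspector;   // assumed: row-by-row reads still need an OI

      public ReaderImpl(Path path, OrcFile.ReaderOptions options) throws IOException {
        super(path, options);                    // footer/metadata parsing now lives in orc-core
        this.inspector = OrcStruct.createObjectInspector(0, getTypes());
      }

      @Override
      public ObjectInspector getObjectInspector() {
        return inspector;
      }

      @Override
      public String toString() {
        return "Hive " + super.toString();       // matches the toString() change in this hunk
      }
    }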
* @param path pathname for file * @param options options for reading * @throws IOException */ - public ReaderImpl(Path path, OrcFile.ReaderOptions options) throws IOException { - FileSystem fs = options.getFilesystem(); - if (fs == null) { - fs = path.getFileSystem(options.getConfiguration()); - } - this.fileSystem = fs; - this.path = path; - this.conf = options.getConfiguration(); - this.maxLength = options.getMaxLength(); - + public ReaderImpl(Path path, + OrcFile.ReaderOptions options) throws IOException { + super(path, options); FileMetadata fileMetadata = options.getFileMetadata(); if (fileMetadata != null) { - this.compressionKind = fileMetadata.getCompressionKind(); - this.bufferSize = fileMetadata.getCompressionBufferSize(); - this.codec = WriterImpl.createCodec(compressionKind); - this.metadataSize = fileMetadata.getMetadataSize(); - this.stripeStats = fileMetadata.getStripeStats(); - this.versionList = fileMetadata.getVersionList(); - this.writerVersion = OrcFile.WriterVersion.from(fileMetadata.getWriterVersionNum()); - this.types = fileMetadata.getTypes(); - this.rowIndexStride = fileMetadata.getRowIndexStride(); - this.contentLength = fileMetadata.getContentLength(); - this.numberOfRows = fileMetadata.getNumberOfRows(); - this.fileStats = fileMetadata.getFileStats(); - this.stripes = fileMetadata.getStripes(); this.inspector = OrcStruct.createObjectInspector(0, fileMetadata.getTypes()); this.footerByteBuffer = null; // not cached and not needed here - this.userMetadata = null; // not cached and not needed here this.footerMetaAndPsBuffer = null; } else { FileMetaInfo footerMetaData; @@ -363,7 +97,7 @@ public ReaderImpl(Path path, OrcFile.ReaderOptions options) throws IOException { footerMetaData = options.getFileMetaInfo(); this.footerMetaAndPsBuffer = null; } else { - footerMetaData = extractMetaInfoFromFooter(fs, path, + footerMetaData = extractMetaInfoFromFooter(fileSystem, path, options.getMaxLength()); this.footerMetaAndPsBuffer = footerMetaData.footerMetaAndPsBuffer; } @@ -374,37 +108,8 @@ public ReaderImpl(Path path, OrcFile.ReaderOptions options) throws IOException { footerMetaData.footerBuffer ); this.footerByteBuffer = footerMetaData.footerBuffer; - this.compressionKind = rInfo.compressionKind; - this.codec = rInfo.codec; - this.bufferSize = rInfo.bufferSize; - this.metadataSize = rInfo.metadataSize; - this.stripeStats = rInfo.metadata.getStripeStatsList(); - this.types = rInfo.footer.getTypesList(); - this.rowIndexStride = rInfo.footer.getRowIndexStride(); - this.contentLength = rInfo.footer.getContentLength(); - this.numberOfRows = rInfo.footer.getNumberOfRows(); - this.userMetadata = rInfo.footer.getMetadataList(); - this.fileStats = rInfo.footer.getStatisticsList(); this.inspector = rInfo.inspector; - this.versionList = footerMetaData.versionList; - this.writerVersion = footerMetaData.writerVersion; - this.stripes = convertProtoStripesToStripes(rInfo.footer.getStripesList()); } - this.schema = OrcUtils.convertTypeFromProtobuf(this.types, 0); - } - - /** - * Get the WriterVersion based on the ORC file postscript. - * @param writerVersion the integer writer version - * @return the writer version of the file - */ - static OrcFile.WriterVersion getWriterVersion(int writerVersion) { - for(OrcFile.WriterVersion version: OrcFile.WriterVersion.values()) { - if (version.getId() == writerVersion) { - return version; - } - } - return OrcFile.WriterVersion.FUTURE; } /** Extracts the necessary metadata from an externally store buffer (fullFooterBuffer). 
*/ @@ -565,20 +270,6 @@ private static FileMetaInfo extractMetaInfoFromFooter(FileSystem fs, ); } - private static OrcFile.WriterVersion extractWriterVersion(OrcProto.PostScript ps) { - return (ps.hasWriterVersion() - ? getWriterVersion(ps.getWriterVersion()) : OrcFile.WriterVersion.ORIGINAL); - } - - private static List convertProtoStripesToStripes( - List stripes) { - List result = new ArrayList(stripes.size()); - for (OrcProto.StripeInformation info : stripes) { - result.add(new StripeInformationImpl(info)); - } - return result; - } - /** * MetaInfoObjExtractor - has logic to create the values for the fields in ReaderImpl * from serialized fields. @@ -617,7 +308,8 @@ private static class MetaInfoObjExtractor{ public FileMetaInfo getFileMetaInfo() { return new FileMetaInfo(compressionKind.toString(), bufferSize, - metadataSize, footerByteBuffer, versionList, writerVersion, footerMetaAndPsBuffer); + getMetadataSize(), footerByteBuffer, getVersionList(), + getWriterVersion(), footerMetaAndPsBuffer); } /** Same as FileMetaInfo, but with extra fields. FileMetaInfo is serialized for splits @@ -696,185 +388,8 @@ public RecordReader rows(long offset, long length, boolean[] include, .searchArgument(sarg, columnNames)); } - @Override - public long getRawDataSize() { - // if the deserializedSize is not computed, then compute it, else - // return the already computed size. since we are reading from the footer - // we don't have to compute deserialized size repeatedly - if (deserializedSize == -1) { - List indices = Lists.newArrayList(); - for (int i = 0; i < fileStats.size(); ++i) { - indices.add(i); - } - deserializedSize = getRawDataSizeFromColIndices(indices); - } - return deserializedSize; - } - - @Override - public long getRawDataSizeFromColIndices(List colIndices) { - return getRawDataSizeFromColIndices(colIndices, types, fileStats); - } - - public static long getRawDataSizeFromColIndices( - List colIndices, List types, - List stats) { - long result = 0; - for (int colIdx : colIndices) { - result += getRawDataSizeOfColumn(colIdx, types, stats); - } - return result; - } - - private static long getRawDataSizeOfColumn(int colIdx, List types, - List stats) { - OrcProto.ColumnStatistics colStat = stats.get(colIdx); - long numVals = colStat.getNumberOfValues(); - OrcProto.Type type = types.get(colIdx); - - switch (type.getKind()) { - case BINARY: - // old orc format doesn't support binary statistics. checking for binary - // statistics is not required as protocol buffers takes care of it. - return colStat.getBinaryStatistics().getSum(); - case STRING: - case CHAR: - case VARCHAR: - // old orc format doesn't support sum for string statistics. checking for - // existence is not required as protocol buffers takes care of it. - - // ORC strings are deserialized to java strings. so use java data model's - // string size - numVals = numVals == 0 ? 
1 : numVals; - int avgStrLen = (int) (colStat.getStringStatistics().getSum() / numVals); - return numVals * JavaDataModel.get().lengthForStringOfLength(avgStrLen); - case TIMESTAMP: - return numVals * JavaDataModel.get().lengthOfTimestamp(); - case DATE: - return numVals * JavaDataModel.get().lengthOfDate(); - case DECIMAL: - return numVals * JavaDataModel.get().lengthOfDecimal(); - case DOUBLE: - case LONG: - return numVals * JavaDataModel.get().primitive2(); - case FLOAT: - case INT: - case SHORT: - case BOOLEAN: - case BYTE: - return numVals * JavaDataModel.get().primitive1(); - default: - LOG.debug("Unknown primitive category: " + type.getKind()); - break; - } - - return 0; - } - - @Override - public long getRawDataSizeOfColumns(List colNames) { - List colIndices = getColumnIndicesFromNames(colNames); - return getRawDataSizeFromColIndices(colIndices); - } - - private List getColumnIndicesFromNames(List colNames) { - // top level struct - OrcProto.Type type = types.get(0); - List colIndices = Lists.newArrayList(); - List fieldNames = type.getFieldNamesList(); - int fieldIdx = 0; - for (String colName : colNames) { - if (fieldNames.contains(colName)) { - fieldIdx = fieldNames.indexOf(colName); - } else { - String s = "Cannot find field for: " + colName + " in "; - for (String fn : fieldNames) { - s += fn + ", "; - } - LOG.warn(s); - continue; - } - - // a single field may span multiple columns. find start and end column - // index for the requested field - int idxStart = type.getSubtypes(fieldIdx); - - int idxEnd; - - // if the specified is the last field and then end index will be last - // column index - if (fieldIdx + 1 > fieldNames.size() - 1) { - idxEnd = getLastIdx() + 1; - } else { - idxEnd = type.getSubtypes(fieldIdx + 1); - } - - // if start index and end index are same then the field is a primitive - // field else complex field (like map, list, struct, union) - if (idxStart == idxEnd) { - // simple field - colIndices.add(idxStart); - } else { - // complex fields spans multiple columns - for (int i = idxStart; i < idxEnd; i++) { - colIndices.add(i); - } - } - } - return colIndices; - } - - private int getLastIdx() { - Set indices = new HashSet<>(); - for (OrcProto.Type type : types) { - indices.addAll(type.getSubtypesList()); - } - return Collections.max(indices); - } - - @Override - public List getOrcProtoStripeStatistics() { - return stripeStats; - } - - @Override - public List getOrcProtoFileStatistics() { - return fileStats; - } - - @Override - public List getStripeStatistics() { - List result = new ArrayList<>(); - for (OrcProto.StripeStatistics ss : stripeStats) { - result.add(new StripeStatistics(ss.getColStatsList())); - } - return result; - } - - public List getOrcProtoUserMetadata() { - return userMetadata; - } - - @Override - public List getVersionList() { - return versionList; - } - - @Override - public int getMetadataSize() { - return metadataSize; - } - @Override public String toString() { - StringBuilder buffer = new StringBuilder(); - buffer.append("ORC Reader("); - buffer.append(path); - if (maxLength != -1) { - buffer.append(", "); - buffer.append(maxLength); - } - buffer.append(")"); - return buffer.toString(); + return "Hive " + super.toString(); } } diff --git a/ql/src/java/org/apache/hadoop/hive/ql/io/orc/RecordReaderImpl.java b/ql/src/java/org/apache/hadoop/hive/ql/io/orc/RecordReaderImpl.java index 2199b117cce5..e46ca51eff9c 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/io/orc/RecordReaderImpl.java +++ 
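Editor's note: the deleted getRawDataSizeOfColumn above estimates a column's raw size by multiplying its value count from the file-level statistics by a per-type width from Hive's JavaDataModel (binary and string columns instead use the byte/length sums carried in the statistics). A rough, self-contained sketch of the same estimation idea follows; the width constants are illustrative stand-ins, not JavaDataModel's actual numbers:

    // Rough raw-data-size estimate per column: numberOfValues times an
    // approximate in-memory width, in the spirit of getRawDataSizeOfColumn.
    final class RawSizeEstimator {
      enum Kind { BOOLEAN, BYTE, SHORT, INT, LONG, FLOAT, DOUBLE, STRING, BINARY }

      static long estimate(Kind kind, long numValues, long sumOfLengths) {
        switch (kind) {
          case BINARY:
            return sumOfLengths;              // binary stats already carry a byte sum
          case STRING: {
            long n = numValues == 0 ? 1 : numValues;
            long avgLen = sumOfLengths / n;   // average string length from the stats
            return n * (40 + 2 * avgLen);     // hypothetical String overhead + chars
          }
          case DOUBLE:
          case LONG:
            return numValues * 8;             // 8-byte primitives
          case FLOAT:
          case INT:
          case SHORT:
          case BYTE:
          case BOOLEAN:
            return numValues * 4;             // analogue of JavaDataModel.primitive1()
          default:
            return 0;
        }
      }
    }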
b/ql/src/java/org/apache/hadoop/hive/ql/io/orc/RecordReaderImpl.java @@ -18,1218 +18,923 @@ package org.apache.hadoop.hive.ql.io.orc; import java.io.IOException; -import java.math.BigDecimal; -import java.sql.Date; -import java.sql.Timestamp; import java.util.ArrayList; -import java.util.Arrays; import java.util.HashMap; import java.util.List; -import java.util.Map; -import org.apache.hadoop.fs.FileSystem; -import org.apache.orc.BooleanColumnStatistics; -import org.apache.orc.impl.BufferChunk; -import org.apache.orc.ColumnStatistics; -import org.apache.orc.impl.ColumnStatisticsImpl; -import org.apache.orc.CompressionCodec; -import org.apache.orc.DataReader; -import org.apache.orc.DateColumnStatistics; -import org.apache.orc.DecimalColumnStatistics; -import org.apache.orc.DoubleColumnStatistics; -import org.apache.orc.impl.DataReaderProperties; -import org.apache.orc.impl.InStream; -import org.apache.orc.IntegerColumnStatistics; -import org.apache.orc.OrcConf; -import org.apache.orc.impl.OrcIndex; -import org.apache.orc.impl.PositionProvider; -import org.apache.orc.impl.StreamName; -import org.apache.orc.StringColumnStatistics; -import org.apache.orc.StripeInformation; -import org.apache.orc.TimestampColumnStatistics; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; -import org.apache.hadoop.fs.Path; -import org.apache.hadoop.hive.common.io.DiskRange; -import org.apache.hadoop.hive.common.io.DiskRangeList; -import org.apache.hadoop.hive.common.io.DiskRangeList.CreateHelper; -import org.apache.hadoop.hive.common.type.HiveDecimal; -import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch; -import org.apache.orc.BloomFilterIO; -import org.apache.hadoop.hive.ql.io.sarg.PredicateLeaf; -import org.apache.hadoop.hive.ql.io.sarg.SearchArgument; -import org.apache.hadoop.hive.ql.io.sarg.SearchArgument.TruthValue; +import org.apache.hadoop.hive.ql.exec.vector.BytesColumnVector; +import org.apache.hadoop.hive.ql.exec.vector.ColumnVector; +import org.apache.hadoop.hive.ql.exec.vector.DecimalColumnVector; +import org.apache.hadoop.hive.ql.exec.vector.DoubleColumnVector; +import org.apache.hadoop.hive.ql.exec.vector.ListColumnVector; +import org.apache.hadoop.hive.ql.exec.vector.LongColumnVector; +import org.apache.hadoop.hive.ql.exec.vector.MapColumnVector; +import org.apache.hadoop.hive.ql.exec.vector.StructColumnVector; +import org.apache.hadoop.hive.ql.exec.vector.TimestampColumnVector; +import org.apache.hadoop.hive.ql.exec.vector.UnionColumnVector; +import org.apache.hadoop.hive.serde2.io.ByteWritable; import org.apache.hadoop.hive.serde2.io.DateWritable; +import org.apache.hadoop.hive.serde2.io.DoubleWritable; +import org.apache.hadoop.hive.serde2.io.HiveCharWritable; import org.apache.hadoop.hive.serde2.io.HiveDecimalWritable; +import org.apache.hadoop.hive.serde2.io.HiveVarcharWritable; +import org.apache.hadoop.hive.serde2.io.ShortWritable; import org.apache.hadoop.hive.serde2.io.TimestampWritable; +import org.apache.hadoop.io.BooleanWritable; +import org.apache.hadoop.io.BytesWritable; +import org.apache.hadoop.io.FloatWritable; +import org.apache.hadoop.io.IntWritable; +import org.apache.hadoop.io.LongWritable; import org.apache.hadoop.io.Text; -import org.apache.orc.OrcProto; +import org.apache.orc.TypeDescription; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; +import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch; -public class RecordReaderImpl implements RecordReader { +public class RecordReaderImpl extends org.apache.orc.impl.RecordReaderImpl + 
implements RecordReader { static final Logger LOG = LoggerFactory.getLogger(RecordReaderImpl.class); - private static final boolean isLogDebugEnabled = LOG.isDebugEnabled(); - private static final Object UNKNOWN_VALUE = new Object(); - private final Path path; - private final long firstRow; - private final List stripes = - new ArrayList(); - private OrcProto.StripeFooter stripeFooter; - private final long totalRowCount; - private final CompressionCodec codec; - private final List types; - private final int bufferSize; - private final boolean[] included; - private final long rowIndexStride; - private long rowInStripe = 0; - private int currentStripe = -1; - private long rowBaseInStripe = 0; - private long rowCountInStripe = 0; - private final Map streams = - new HashMap(); - DiskRangeList bufferChunks = null; - private final TreeReaderFactory.TreeReader reader; - private final OrcProto.RowIndex[] indexes; - private final OrcProto.BloomFilterIndex[] bloomFilterIndices; - private final SargApplier sargApp; - // an array about which row groups aren't skipped - private boolean[] includedRowGroups = null; - private final DataReader dataReader; + private final VectorizedRowBatch batch; + private int rowInBatch; + private long baseRow; - /** - * Given a list of column names, find the given column and return the index. - * - * @param columnNames the list of potential column names - * @param columnName the column name to look for - * @param rootColumn offset the result with the rootColumn - * @return the column number or -1 if the column wasn't found - */ - static int findColumns(String[] columnNames, - String columnName, - int rootColumn) { - for(int i=0; i < columnNames.length; ++i) { - if (columnName.equals(columnNames[i])) { - return i + rootColumn; - } - } - return -1; + protected RecordReaderImpl(ReaderImpl fileReader, + Reader.Options options) throws IOException { + super(fileReader, options); + batch = this.schema.createRowBatch(); + rowInBatch = 0; } /** - * Find the mapping from predicate leaves to columns. - * @param sargLeaves the search argument that we need to map - * @param columnNames the names of the columns - * @param rootColumn the offset of the top level row, which offsets the - * result - * @return an array mapping the sarg leaves to concrete column numbers + * If the current batch is empty, get a new one. + * @return true if we have rows available. 
+ * @throws IOException */ - public static int[] mapSargColumnsToOrcInternalColIdx(List sargLeaves, - String[] columnNames, - int rootColumn) { - int[] result = new int[sargLeaves.size()]; - Arrays.fill(result, -1); - for(int i=0; i < result.length; ++i) { - String colName = sargLeaves.get(i).getColumnName(); - result[i] = findColumns(columnNames, colName, rootColumn); + boolean ensureBatch() throws IOException { + if (rowInBatch >= batch.size) { + baseRow = super.getRowNumber(); + rowInBatch = 0; + return super.nextBatch(batch); } - return result; + return true; } - protected RecordReaderImpl(ReaderImpl fileReader, - Reader.Options options) throws IOException { - SchemaEvolution treeReaderSchema; - this.included = options.getInclude(); - included[0] = true; - if (options.getSchema() == null) { - if (LOG.isInfoEnabled()) { - LOG.info("Schema on read not provided -- using file schema " + - fileReader.getSchema()); - } - treeReaderSchema = new SchemaEvolution(fileReader.getSchema(), included); - } else { + @Override + public long getRowNumber() { + return baseRow + rowInBatch; + } - // Now that we are creating a record reader for a file, validate that the schema to read - // is compatible with the file schema. - // - treeReaderSchema = new SchemaEvolution(fileReader.getSchema(), - options.getSchema(), - included); - } - this.path = fileReader.path; - this.codec = fileReader.codec; - this.types = fileReader.types; - this.bufferSize = fileReader.bufferSize; - this.rowIndexStride = fileReader.rowIndexStride; - FileSystem fileSystem = fileReader.fileSystem; - SearchArgument sarg = options.getSearchArgument(); - if (sarg != null && rowIndexStride != 0) { - sargApp = new SargApplier( - sarg, options.getColumnNames(), rowIndexStride, types, included.length); - } else { - sargApp = null; - } - long rows = 0; - long skippedRows = 0; - long offset = options.getOffset(); - long maxOffset = options.getMaxOffset(); - for(StripeInformation stripe: fileReader.getStripes()) { - long stripeStart = stripe.getOffset(); - if (offset > stripeStart) { - skippedRows += stripe.getNumberOfRows(); - } else if (stripeStart < maxOffset) { - this.stripes.add(stripe); - rows += stripe.getNumberOfRows(); - } - } + @Override + public boolean hasNext() throws IOException { + return ensureBatch(); + } - Boolean zeroCopy = options.getUseZeroCopy(); - if (zeroCopy == null) { - zeroCopy = OrcConf.USE_ZEROCOPY.getBoolean(fileReader.conf); - } - if (options.getDataReader() == null) { - dataReader = RecordReaderUtils.createDefaultDataReader( - DataReaderProperties.builder() - .withBufferSize(bufferSize) - .withCompression(fileReader.compressionKind) - .withFileSystem(fileSystem) - .withPath(path) - .withTypeCount(types.size()) - .withZeroCopy(zeroCopy) - .build()); + @Override + public void seekToRow(long row) throws IOException { + if (row >= baseRow && row < baseRow + batch.size) { + rowInBatch = (int) (row - baseRow); } else { - dataReader = options.getDataReader(); + super.seekToRow(row); + batch.size = 0; + ensureBatch(); } - firstRow = skippedRows; - totalRowCount = rows; - Boolean skipCorrupt = options.getSkipCorruptRecords(); - if (skipCorrupt == null) { - skipCorrupt = OrcConf.SKIP_CORRUPT_DATA.getBoolean(fileReader.conf); - } - - reader = TreeReaderFactory.createTreeReader(treeReaderSchema.getReaderSchema(), - treeReaderSchema, included, skipCorrupt); - indexes = new OrcProto.RowIndex[types.size()]; - bloomFilterIndices = new OrcProto.BloomFilterIndex[types.size()]; - advanceToNextRow(reader, 0L, true); } - public static 
final class PositionProviderImpl implements PositionProvider { - private final OrcProto.RowIndexEntry entry; - private int index; - - public PositionProviderImpl(OrcProto.RowIndexEntry entry) { - this(entry, 0); + @Override + public Object next(Object previous) throws IOException { + if (!ensureBatch()) { + return null; } - - public PositionProviderImpl(OrcProto.RowIndexEntry entry, int startPos) { - this.entry = entry; - this.index = startPos; + if (schema.getCategory() == TypeDescription.Category.STRUCT) { + OrcStruct result; + List children = schema.getChildren(); + int numberOfChildren = children.size(); + if (previous == null || previous.getClass() != OrcStruct.class) { + result = new OrcStruct(numberOfChildren); + previous = result; + } else { + result = (OrcStruct) previous; + if (result.getNumFields() != numberOfChildren) { + result.setNumFields(numberOfChildren); + } + } + for(int i=0; i < numberOfChildren; ++i) { + result.setFieldValue(i, nextValue(batch.cols[i], rowInBatch, + children.get(i), result.getFieldValue(i))); + } + } else { + previous = nextValue(batch.cols[0], rowInBatch, schema, previous); } + rowInBatch += 1; + return previous; + } - @Override - public long getNext() { - return entry.getPositions(index++); + public boolean nextBatch(VectorizedRowBatch theirBatch) throws IOException { + // If the user hasn't been reading by row, use the fast path. + if (rowInBatch >= batch.size) { + return super.nextBatch(theirBatch); } + copyIntoBatch(theirBatch, batch, rowInBatch); + rowInBatch += theirBatch.size; + return theirBatch.size > 0; } - OrcProto.StripeFooter readStripeFooter(StripeInformation stripe) throws IOException { - return dataReader.readStripeFooter(stripe); + @Override + public void close() throws IOException { + super.close(); + // free the memory for the column vectors + batch.cols = null; } - enum Location { - BEFORE, MIN, MIDDLE, MAX, AFTER - } + /* Routines for stubbing into Writables */ - /** - * Given a point and min and max, determine if the point is before, at the - * min, in the middle, at the max, or after the range. - * @param point the point to test - * @param min the minimum point - * @param max the maximum point - * @param the type of the comparision - * @return the location of the point - */ - static Location compareToRange(Comparable point, T min, T max) { - int minCompare = point.compareTo(min); - if (minCompare < 0) { - return Location.BEFORE; - } else if (minCompare == 0) { - return Location.MIN; + static BooleanWritable nextBoolean(ColumnVector vector, + int row, + Object previous) { + if (vector.isRepeating) { + row = 0; } - int maxCompare = point.compareTo(max); - if (maxCompare > 0) { - return Location.AFTER; - } else if (maxCompare == 0) { - return Location.MAX; + if (vector.noNulls || !vector.isNull[row]) { + BooleanWritable result; + if (previous == null || previous.getClass() != BooleanWritable.class) { + result = new BooleanWritable(); + } else { + result = (BooleanWritable) previous; + } + result.set(((LongColumnVector) vector).vector[row] != 0); + return result; + } else { + return null; } - return Location.MIDDLE; } - /** - * Get the maximum value out of an index entry. 
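Editor's note: the new Hive RecordReaderImpl in this hunk is a thin shim. It extends the vectorized org.apache.orc.impl.RecordReaderImpl, keeps one VectorizedRowBatch, refills it in ensureBatch(), and materializes a single row per next() call, tracking baseRow so getRowNumber() and seekToRow() can map between absolute rows and positions inside the cached batch. The sketch below shows the same row-over-batch adapter pattern in isolation; BatchSource and the List<String> batch are stand-ins for the real reader and VectorizedRowBatch so that the snippet compiles by itself:

    import java.io.IOException;
    import java.util.ArrayList;
    import java.util.List;
    import java.util.NoSuchElementException;

    // Row-by-row reader layered on top of a batch-oriented reader.
    final class RowOverBatchReader {
      interface BatchSource {
        /** Fills 'out' with the next batch; returns false at end of file. */
        boolean nextBatch(List<String> out) throws IOException;
      }

      private final BatchSource source;
      private final ArrayList<String> batch = new ArrayList<>();
      private int rowInBatch = 0;

      RowOverBatchReader(BatchSource source) { this.source = source; }

      /** If the current batch is exhausted, pull a new one. */
      private boolean ensureBatch() throws IOException {
        if (rowInBatch >= batch.size()) {
          batch.clear();
          rowInBatch = 0;
          return source.nextBatch(batch) && !batch.isEmpty();
        }
        return true;
      }

      boolean hasNext() throws IOException { return ensureBatch(); }

      String next() throws IOException {
        if (!ensureBatch()) throw new NoSuchElementException();
        return batch.get(rowInBatch++);
      }
    }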
- * @param index - * the index entry - * @return the object for the maximum value or null if there isn't one - */ - static Object getMax(ColumnStatistics index) { - if (index instanceof IntegerColumnStatistics) { - return ((IntegerColumnStatistics) index).getMaximum(); - } else if (index instanceof DoubleColumnStatistics) { - return ((DoubleColumnStatistics) index).getMaximum(); - } else if (index instanceof StringColumnStatistics) { - return ((StringColumnStatistics) index).getMaximum(); - } else if (index instanceof DateColumnStatistics) { - return ((DateColumnStatistics) index).getMaximum(); - } else if (index instanceof DecimalColumnStatistics) { - return ((DecimalColumnStatistics) index).getMaximum(); - } else if (index instanceof TimestampColumnStatistics) { - return ((TimestampColumnStatistics) index).getMaximum(); - } else if (index instanceof BooleanColumnStatistics) { - if (((BooleanColumnStatistics)index).getTrueCount()!=0) { - return Boolean.TRUE; + static ByteWritable nextByte(ColumnVector vector, + int row, + Object previous) { + if (vector.isRepeating) { + row = 0; + } + if (vector.noNulls || !vector.isNull[row]) { + ByteWritable result; + if (previous == null || previous.getClass() != ByteWritable.class) { + result = new ByteWritable(); } else { - return Boolean.FALSE; + result = (ByteWritable) previous; } + result.set((byte) ((LongColumnVector) vector).vector[row]); + return result; } else { return null; } } - /** - * Get the minimum value out of an index entry. - * @param index - * the index entry - * @return the object for the minimum value or null if there isn't one - */ - static Object getMin(ColumnStatistics index) { - if (index instanceof IntegerColumnStatistics) { - return ((IntegerColumnStatistics) index).getMinimum(); - } else if (index instanceof DoubleColumnStatistics) { - return ((DoubleColumnStatistics) index).getMinimum(); - } else if (index instanceof StringColumnStatistics) { - return ((StringColumnStatistics) index).getMinimum(); - } else if (index instanceof DateColumnStatistics) { - return ((DateColumnStatistics) index).getMinimum(); - } else if (index instanceof DecimalColumnStatistics) { - return ((DecimalColumnStatistics) index).getMinimum(); - } else if (index instanceof TimestampColumnStatistics) { - return ((TimestampColumnStatistics) index).getMinimum(); - } else if (index instanceof BooleanColumnStatistics) { - if (((BooleanColumnStatistics)index).getFalseCount()!=0) { - return Boolean.FALSE; + static ShortWritable nextShort(ColumnVector vector, + int row, + Object previous) { + if (vector.isRepeating) { + row = 0; + } + if (vector.noNulls || !vector.isNull[row]) { + ShortWritable result; + if (previous == null || previous.getClass() != ShortWritable.class) { + result = new ShortWritable(); } else { - return Boolean.TRUE; + result = (ShortWritable) previous; } + result.set((short) ((LongColumnVector) vector).vector[row]); + return result; } else { - return UNKNOWN_VALUE; // null is not safe here + return null; } } - /** - * Evaluate a predicate with respect to the statistics from the column - * that is referenced in the predicate. - * @param statsProto the statistics for the column mentioned in the predicate - * @param predicate the leaf predicate we need to evaluation - * @param bloomFilter - * @return the set of truth values that may be returned for the given - * predicate. 
- */ - static TruthValue evaluatePredicateProto(OrcProto.ColumnStatistics statsProto, - PredicateLeaf predicate, OrcProto.BloomFilter bloomFilter) { - ColumnStatistics cs = ColumnStatisticsImpl.deserialize(statsProto); - Object minValue = getMin(cs); - Object maxValue = getMax(cs); - BloomFilterIO bf = null; - if (bloomFilter != null) { - bf = new BloomFilterIO(bloomFilter); + static IntWritable nextInt(ColumnVector vector, + int row, + Object previous) { + if (vector.isRepeating) { + row = 0; } - return evaluatePredicateRange(predicate, minValue, maxValue, cs.hasNull(), bf); - } - - /** - * Evaluate a predicate with respect to the statistics from the column - * that is referenced in the predicate. - * @param stats the statistics for the column mentioned in the predicate - * @param predicate the leaf predicate we need to evaluation - * @return the set of truth values that may be returned for the given - * predicate. - */ - static TruthValue evaluatePredicate(ColumnStatistics stats, - PredicateLeaf predicate, BloomFilterIO bloomFilter) { - Object minValue = getMin(stats); - Object maxValue = getMax(stats); - return evaluatePredicateRange(predicate, minValue, maxValue, stats.hasNull(), bloomFilter); - } - - static TruthValue evaluatePredicateRange(PredicateLeaf predicate, Object min, - Object max, boolean hasNull, BloomFilterIO bloomFilter) { - // if we didn't have any values, everything must have been null - if (min == null) { - if (predicate.getOperator() == PredicateLeaf.Operator.IS_NULL) { - return TruthValue.YES; + if (vector.noNulls || !vector.isNull[row]) { + IntWritable result; + if (previous == null || previous.getClass() != IntWritable.class) { + result = new IntWritable(); } else { - return TruthValue.NULL; + result = (IntWritable) previous; } - } else if (min == UNKNOWN_VALUE) { - return TruthValue.YES_NO_NULL; + result.set((int) ((LongColumnVector) vector).vector[row]); + return result; + } else { + return null; } + } - TruthValue result; - Object baseObj = predicate.getLiteral(); - try { - // Predicate object and stats objects are converted to the type of the predicate object. - Object minValue = getBaseObjectForComparison(predicate.getType(), min); - Object maxValue = getBaseObjectForComparison(predicate.getType(), max); - Object predObj = getBaseObjectForComparison(predicate.getType(), baseObj); - - result = evaluatePredicateMinMax(predicate, predObj, minValue, maxValue, hasNull); - if (shouldEvaluateBloomFilter(predicate, result, bloomFilter)) { - result = evaluatePredicateBloomFilter(predicate, predObj, bloomFilter, hasNull); - } - // in case failed conversion, return the default YES_NO_NULL truth value - } catch (Exception e) { - if (LOG.isWarnEnabled()) { - final String statsType = min == null ? - (max == null ? "null" : max.getClass().getSimpleName()) : - min.getClass().getSimpleName(); - final String predicateType = baseObj == null ? "null" : baseObj.getClass().getSimpleName(); - final String reason = e.getClass().getSimpleName() + " when evaluating predicate." + - " Skipping ORC PPD." 
+ - " Exception: " + e.getMessage() + - " StatsType: " + statsType + - " PredicateType: " + predicateType; - LOG.warn(reason); - LOG.debug(reason, e); - } - if (predicate.getOperator().equals(PredicateLeaf.Operator.NULL_SAFE_EQUALS) || !hasNull) { - result = TruthValue.YES_NO; + static LongWritable nextLong(ColumnVector vector, + int row, + Object previous) { + if (vector.isRepeating) { + row = 0; + } + if (vector.noNulls || !vector.isNull[row]) { + LongWritable result; + if (previous == null || previous.getClass() != LongWritable.class) { + result = new LongWritable(); } else { - result = TruthValue.YES_NO_NULL; + result = (LongWritable) previous; } + result.set(((LongColumnVector) vector).vector[row]); + return result; + } else { + return null; } - return result; } - private static boolean shouldEvaluateBloomFilter(PredicateLeaf predicate, - TruthValue result, BloomFilterIO bloomFilter) { - // evaluate bloom filter only when - // 1) Bloom filter is available - // 2) Min/Max evaluation yield YES or MAYBE - // 3) Predicate is EQUALS or IN list - if (bloomFilter != null - && result != TruthValue.NO_NULL && result != TruthValue.NO - && (predicate.getOperator().equals(PredicateLeaf.Operator.EQUALS) - || predicate.getOperator().equals(PredicateLeaf.Operator.NULL_SAFE_EQUALS) - || predicate.getOperator().equals(PredicateLeaf.Operator.IN))) { - return true; + static FloatWritable nextFloat(ColumnVector vector, + int row, + Object previous) { + if (vector.isRepeating) { + row = 0; } - return false; - } - - private static TruthValue evaluatePredicateMinMax(PredicateLeaf predicate, Object predObj, - Object minValue, - Object maxValue, - boolean hasNull) { - Location loc; - - switch (predicate.getOperator()) { - case NULL_SAFE_EQUALS: - loc = compareToRange((Comparable) predObj, minValue, maxValue); - if (loc == Location.BEFORE || loc == Location.AFTER) { - return TruthValue.NO; - } else { - return TruthValue.YES_NO; - } - case EQUALS: - loc = compareToRange((Comparable) predObj, minValue, maxValue); - if (minValue.equals(maxValue) && loc == Location.MIN) { - return hasNull ? TruthValue.YES_NULL : TruthValue.YES; - } else if (loc == Location.BEFORE || loc == Location.AFTER) { - return hasNull ? TruthValue.NO_NULL : TruthValue.NO; - } else { - return hasNull ? TruthValue.YES_NO_NULL : TruthValue.YES_NO; - } - case LESS_THAN: - loc = compareToRange((Comparable) predObj, minValue, maxValue); - if (loc == Location.AFTER) { - return hasNull ? TruthValue.YES_NULL : TruthValue.YES; - } else if (loc == Location.BEFORE || loc == Location.MIN) { - return hasNull ? TruthValue.NO_NULL : TruthValue.NO; - } else { - return hasNull ? TruthValue.YES_NO_NULL : TruthValue.YES_NO; - } - case LESS_THAN_EQUALS: - loc = compareToRange((Comparable) predObj, minValue, maxValue); - if (loc == Location.AFTER || loc == Location.MAX) { - return hasNull ? TruthValue.YES_NULL : TruthValue.YES; - } else if (loc == Location.BEFORE) { - return hasNull ? TruthValue.NO_NULL : TruthValue.NO; - } else { - return hasNull ? TruthValue.YES_NO_NULL : TruthValue.YES_NO; - } - case IN: - if (minValue.equals(maxValue)) { - // for a single value, look through to see if that value is in the - // set - for (Object arg : predicate.getLiteralList()) { - predObj = getBaseObjectForComparison(predicate.getType(), arg); - loc = compareToRange((Comparable) predObj, minValue, maxValue); - if (loc == Location.MIN) { - return hasNull ? TruthValue.YES_NULL : TruthValue.YES; - } - } - return hasNull ? 
TruthValue.NO_NULL : TruthValue.NO; - } else { - // are all of the values outside of the range? - for (Object arg : predicate.getLiteralList()) { - predObj = getBaseObjectForComparison(predicate.getType(), arg); - loc = compareToRange((Comparable) predObj, minValue, maxValue); - if (loc == Location.MIN || loc == Location.MIDDLE || - loc == Location.MAX) { - return hasNull ? TruthValue.YES_NO_NULL : TruthValue.YES_NO; - } - } - return hasNull ? TruthValue.NO_NULL : TruthValue.NO; - } - case BETWEEN: - List args = predicate.getLiteralList(); - Object predObj1 = getBaseObjectForComparison(predicate.getType(), args.get(0)); - - loc = compareToRange((Comparable) predObj1, minValue, maxValue); - if (loc == Location.BEFORE || loc == Location.MIN) { - Object predObj2 = getBaseObjectForComparison(predicate.getType(), args.get(1)); - - Location loc2 = compareToRange((Comparable) predObj2, minValue, maxValue); - if (loc2 == Location.AFTER || loc2 == Location.MAX) { - return hasNull ? TruthValue.YES_NULL : TruthValue.YES; - } else if (loc2 == Location.BEFORE) { - return hasNull ? TruthValue.NO_NULL : TruthValue.NO; - } else { - return hasNull ? TruthValue.YES_NO_NULL : TruthValue.YES_NO; - } - } else if (loc == Location.AFTER) { - return hasNull ? TruthValue.NO_NULL : TruthValue.NO; - } else { - return hasNull ? TruthValue.YES_NO_NULL : TruthValue.YES_NO; - } - case IS_NULL: - // min = null condition above handles the all-nulls YES case - return hasNull ? TruthValue.YES_NO : TruthValue.NO; - default: - return hasNull ? TruthValue.YES_NO_NULL : TruthValue.YES_NO; + if (vector.noNulls || !vector.isNull[row]) { + FloatWritable result; + if (previous == null || previous.getClass() != FloatWritable.class) { + result = new FloatWritable(); + } else { + result = (FloatWritable) previous; + } + result.set((float) ((DoubleColumnVector) vector).vector[row]); + return result; + } else { + return null; } } - private static TruthValue evaluatePredicateBloomFilter(PredicateLeaf predicate, - final Object predObj, BloomFilterIO bloomFilter, boolean hasNull) { - switch (predicate.getOperator()) { - case NULL_SAFE_EQUALS: - // null safe equals does not return *_NULL variant. So set hasNull to false - return checkInBloomFilter(bloomFilter, predObj, false); - case EQUALS: - return checkInBloomFilter(bloomFilter, predObj, hasNull); - case IN: - for (Object arg : predicate.getLiteralList()) { - // if atleast one value in IN list exist in bloom filter, qualify the row group/stripe - Object predObjItem = getBaseObjectForComparison(predicate.getType(), arg); - TruthValue result = checkInBloomFilter(bloomFilter, predObjItem, hasNull); - if (result == TruthValue.YES_NO_NULL || result == TruthValue.YES_NO) { - return result; - } - } - return hasNull ? TruthValue.NO_NULL : TruthValue.NO; - default: - return hasNull ? TruthValue.YES_NO_NULL : TruthValue.YES_NO; + static DoubleWritable nextDouble(ColumnVector vector, + int row, + Object previous) { + if (vector.isRepeating) { + row = 0; } - } - - private static TruthValue checkInBloomFilter(BloomFilterIO bf, Object predObj, boolean hasNull) { - TruthValue result = hasNull ? 
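Editor's note: the deleted compareToRange/evaluatePredicateMinMax pair (now in org.apache.orc.impl.RecordReaderImpl) classifies a predicate literal against a column's min/max statistics and turns the result into a SearchArgument TruthValue. A minimal sketch of that classification, reduced to a may-match test for equality predicates; the class and method names here are illustrative, not part of the patch:

    // Locate a literal relative to a column's [min, max] range and decide
    // whether an equality predicate could possibly match the row group.
    final class MinMaxPruning {
      enum Location { BEFORE, MIN, MIDDLE, MAX, AFTER }

      static <T> Location locate(Comparable<T> point, T min, T max) {
        int c = point.compareTo(min);
        if (c < 0) return Location.BEFORE;
        if (c == 0) return Location.MIN;
        c = point.compareTo(max);
        if (c > 0) return Location.AFTER;
        if (c == 0) return Location.MAX;
        return Location.MIDDLE;
      }

      /** Can an equality predicate possibly match a row group with this range? */
      static <T> boolean equalsMayMatch(Comparable<T> literal, T min, T max) {
        Location loc = locate(literal, min, max);
        return loc != Location.BEFORE && loc != Location.AFTER;
      }
    }

For example, equalsMayMatch(42L, 10L, 20L) returns false, so a row group whose statistics report min=10 and max=20 can be skipped for the predicate col = 42.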
TruthValue.NO_NULL : TruthValue.NO; - - if (predObj instanceof Long) { - if (bf.testLong(((Long) predObj).longValue())) { - result = TruthValue.YES_NO_NULL; - } - } else if (predObj instanceof Double) { - if (bf.testDouble(((Double) predObj).doubleValue())) { - result = TruthValue.YES_NO_NULL; - } - } else if (predObj instanceof String || predObj instanceof Text || - predObj instanceof HiveDecimalWritable || - predObj instanceof BigDecimal) { - if (bf.testString(predObj.toString())) { - result = TruthValue.YES_NO_NULL; - } - } else if (predObj instanceof Timestamp) { - if (bf.testLong(((Timestamp) predObj).getTime())) { - result = TruthValue.YES_NO_NULL; - } - } else if (predObj instanceof TimestampWritable) { - if (bf.testLong(((TimestampWritable) predObj).getTimestamp().getTime())) { - result = TruthValue.YES_NO_NULL; - } - } else if (predObj instanceof Date) { - if (bf.testLong(DateWritable.dateToDays((Date) predObj))) { - result = TruthValue.YES_NO_NULL; + if (vector.noNulls || !vector.isNull[row]) { + DoubleWritable result; + if (previous == null || previous.getClass() != DoubleWritable.class) { + result = new DoubleWritable(); + } else { + result = (DoubleWritable) previous; } + result.set(((DoubleColumnVector) vector).vector[row]); + return result; } else { - // if the predicate object is null and if hasNull says there are no nulls then return NO - if (predObj == null && !hasNull) { - result = TruthValue.NO; - } else { - result = TruthValue.YES_NO_NULL; - } - } - - if (result == TruthValue.YES_NO_NULL && !hasNull) { - result = TruthValue.YES_NO; - } - - if (LOG.isDebugEnabled()) { - LOG.debug("Bloom filter evaluation: " + result.toString()); + return null; } - - return result; } - private static Object getBaseObjectForComparison(PredicateLeaf.Type type, Object obj) { - if (obj == null) { - return null; + static Text nextString(ColumnVector vector, + int row, + Object previous) { + if (vector.isRepeating) { + row = 0; } - switch (type) { - case BOOLEAN: - if (obj instanceof Boolean) { - return obj; - } else { - // will only be true if the string conversion yields "true", all other values are - // considered false - return Boolean.valueOf(obj.toString()); - } - case DATE: - if (obj instanceof Date) { - return obj; - } else if (obj instanceof String) { - return Date.valueOf((String) obj); - } else if (obj instanceof Timestamp) { - return DateWritable.timeToDate(((Timestamp) obj).getTime() / 1000L); - } - // always string, but prevent the comparison to numbers (are they days/seconds/milliseconds?) - break; - case DECIMAL: - if (obj instanceof Boolean) { - return new HiveDecimalWritable(((Boolean) obj).booleanValue() ? 
- HiveDecimal.ONE : HiveDecimal.ZERO); - } else if (obj instanceof Integer) { - return new HiveDecimalWritable(((Integer) obj).intValue()); - } else if (obj instanceof Long) { - return new HiveDecimalWritable(((Long) obj)); - } else if (obj instanceof Float || obj instanceof Double || - obj instanceof String) { - return new HiveDecimalWritable(obj.toString()); - } else if (obj instanceof BigDecimal) { - return new HiveDecimalWritable(HiveDecimal.create((BigDecimal) obj)); - } else if (obj instanceof HiveDecimal) { - return new HiveDecimalWritable((HiveDecimal) obj); - } else if (obj instanceof HiveDecimalWritable) { - return obj; - } else if (obj instanceof Timestamp) { - return new HiveDecimalWritable( - new Double(new TimestampWritable((Timestamp) obj).getDouble()).toString()); - } - break; - case FLOAT: - if (obj instanceof Number) { - // widening conversion - return ((Number) obj).doubleValue(); - } else if (obj instanceof HiveDecimal) { - return ((HiveDecimal) obj).doubleValue(); - } else if (obj instanceof String) { - return Double.valueOf(obj.toString()); - } else if (obj instanceof Timestamp) { - return new TimestampWritable((Timestamp)obj).getDouble(); - } else if (obj instanceof HiveDecimal) { - return ((HiveDecimal) obj).doubleValue(); - } else if (obj instanceof BigDecimal) { - return ((BigDecimal) obj).doubleValue(); - } - break; - case LONG: - if (obj instanceof Number) { - // widening conversion - return ((Number) obj).longValue(); - } else if (obj instanceof HiveDecimal) { - return ((HiveDecimal) obj).longValue(); - } else if (obj instanceof String) { - return Long.valueOf(obj.toString()); - } - break; - case STRING: - if (obj != null) { - return (obj.toString()); - } - break; - case TIMESTAMP: - if (obj instanceof Timestamp) { - return obj; - } else if (obj instanceof Integer) { - return TimestampWritable.longToTimestamp(((Number) obj).longValue(), false); - } else if (obj instanceof Float) { - return TimestampWritable.doubleToTimestamp(((Float) obj).doubleValue()); - } else if (obj instanceof Double) { - return TimestampWritable.doubleToTimestamp(((Double) obj).doubleValue()); - } else if (obj instanceof HiveDecimal) { - return TimestampWritable.decimalToTimestamp((HiveDecimal) obj); - } else if (obj instanceof HiveDecimalWritable) { - return TimestampWritable.decimalToTimestamp(((HiveDecimalWritable) obj).getHiveDecimal()); - } else if (obj instanceof Date) { - return new Timestamp(((Date) obj).getTime()); - } - // float/double conversion to timestamp is interpreted as seconds whereas integer conversion - // to timestamp is interpreted as milliseconds by default. The integer to timestamp casting - // is also config driven. The filter operator changes its promotion based on config: - // "int.timestamp.conversion.in.seconds". Disable PPD for integer cases. - break; - default: - break; + if (vector.noNulls || !vector.isNull[row]) { + Text result; + if (previous == null || previous.getClass() != Text.class) { + result = new Text(); + } else { + result = (Text) previous; + } + BytesColumnVector bytes = (BytesColumnVector) vector; + result.set(bytes.vector[row], bytes.start[row], bytes.length[row]); + return result; + } else { + return null; } - - throw new IllegalArgumentException(String.format( - "ORC SARGS could not convert from %s to %s", obj == null ? 
"(null)" : obj.getClass() - .getSimpleName(), type)); } - public static class SargApplier { - public final static boolean[] READ_ALL_RGS = null; - public final static boolean[] READ_NO_RGS = new boolean[0]; - - private final SearchArgument sarg; - private final List sargLeaves; - private final int[] filterColumns; - private final long rowIndexStride; - // same as the above array, but indices are set to true - private final boolean[] sargColumns; - - public SargApplier(SearchArgument sarg, String[] columnNames, long rowIndexStride, - List types, int includedCount) { - this.sarg = sarg; - sargLeaves = sarg.getLeaves(); - filterColumns = mapSargColumnsToOrcInternalColIdx(sargLeaves, columnNames, 0); - this.rowIndexStride = rowIndexStride; - // included will not be null, row options will fill the array with trues if null - sargColumns = new boolean[includedCount]; - for (int i : filterColumns) { - // filter columns may have -1 as index which could be partition column in SARG. - if (i > 0) { - sargColumns[i] = true; - } - } + static HiveCharWritable nextChar(ColumnVector vector, + int row, + int size, + Object previous) { + if (vector.isRepeating) { + row = 0; } - - /** - * Pick the row groups that we need to load from the current stripe. - * - * @return an array with a boolean for each row group or null if all of the - * row groups must be read. - * @throws IOException - */ - public boolean[] pickRowGroups(StripeInformation stripe, OrcProto.RowIndex[] indexes, - OrcProto.BloomFilterIndex[] bloomFilterIndices, boolean returnNone) throws IOException { - long rowsInStripe = stripe.getNumberOfRows(); - int groupsInStripe = (int) ((rowsInStripe + rowIndexStride - 1) / rowIndexStride); - boolean[] result = new boolean[groupsInStripe]; // TODO: avoid alloc? - TruthValue[] leafValues = new TruthValue[sargLeaves.size()]; - boolean hasSelected = false, hasSkipped = false; - for (int rowGroup = 0; rowGroup < result.length; ++rowGroup) { - for (int pred = 0; pred < leafValues.length; ++pred) { - int columnIx = filterColumns[pred]; - if (columnIx != -1) { - if (indexes[columnIx] == null) { - throw new AssertionError("Index is not populated for " + columnIx); - } - OrcProto.RowIndexEntry entry = indexes[columnIx].getEntry(rowGroup); - if (entry == null) { - throw new AssertionError("RG is not populated for " + columnIx + " rg " + rowGroup); - } - OrcProto.ColumnStatistics stats = entry.getStatistics(); - OrcProto.BloomFilter bf = null; - if (bloomFilterIndices != null && bloomFilterIndices[filterColumns[pred]] != null) { - bf = bloomFilterIndices[filterColumns[pred]].getBloomFilter(rowGroup); - } - leafValues[pred] = evaluatePredicateProto(stats, sargLeaves.get(pred), bf); - if (LOG.isTraceEnabled()) { - LOG.trace("Stats = " + stats); - LOG.trace("Setting " + sargLeaves.get(pred) + " to " + leafValues[pred]); - } - } else { - // the column is a virtual column - leafValues[pred] = TruthValue.YES_NO_NULL; - } - } - result[rowGroup] = sarg.evaluate(leafValues).isNeeded(); - hasSelected = hasSelected || result[rowGroup]; - hasSkipped = hasSkipped || (!result[rowGroup]); - if (LOG.isDebugEnabled()) { - LOG.debug("Row group " + (rowIndexStride * rowGroup) + " to " + - (rowIndexStride * (rowGroup + 1) - 1) + " is " + - (result[rowGroup] ? 
"" : "not ") + "included."); - } + if (vector.noNulls || !vector.isNull[row]) { + HiveCharWritable result; + if (previous == null || previous.getClass() != HiveCharWritable.class) { + result = new HiveCharWritable(); + } else { + result = (HiveCharWritable) previous; } - - return hasSkipped ? ((hasSelected || !returnNone) ? result : READ_NO_RGS) : READ_ALL_RGS; + BytesColumnVector bytes = (BytesColumnVector) vector; + result.set(bytes.toString(row), size); + return result; + } else { + return null; } } - /** - * Pick the row groups that we need to load from the current stripe. - * - * @return an array with a boolean for each row group or null if all of the - * row groups must be read. - * @throws IOException - */ - protected boolean[] pickRowGroups() throws IOException { - // if we don't have a sarg or indexes, we read everything - if (sargApp == null) { + static HiveVarcharWritable nextVarchar(ColumnVector vector, + int row, + int size, + Object previous) { + if (vector.isRepeating) { + row = 0; + } + if (vector.noNulls || !vector.isNull[row]) { + HiveVarcharWritable result; + if (previous == null || previous.getClass() != HiveVarcharWritable.class) { + result = new HiveVarcharWritable(); + } else { + result = (HiveVarcharWritable) previous; + } + BytesColumnVector bytes = (BytesColumnVector) vector; + result.set(bytes.toString(row), size); + return result; + } else { return null; } - readRowIndex(currentStripe, included, sargApp.sargColumns); - return sargApp.pickRowGroups(stripes.get(currentStripe), indexes, bloomFilterIndices, false); } - private void clearStreams() { - // explicit close of all streams to de-ref ByteBuffers - for (InStream is : streams.values()) { - is.close(); + static BytesWritable nextBinary(ColumnVector vector, + int row, + Object previous) { + if (vector.isRepeating) { + row = 0; } - if (bufferChunks != null) { - if (dataReader.isTrackingDiskRanges()) { - for (DiskRangeList range = bufferChunks; range != null; range = range.next) { - if (!(range instanceof BufferChunk)) { - continue; - } - dataReader.releaseBuffer(((BufferChunk) range).getChunk()); - } + if (vector.noNulls || !vector.isNull[row]) { + BytesWritable result; + if (previous == null || previous.getClass() != BytesWritable.class) { + result = new BytesWritable(); + } else { + result = (BytesWritable) previous; } + BytesColumnVector bytes = (BytesColumnVector) vector; + result.set(bytes.vector[row], bytes.start[row], bytes.length[row]); + return result; + } else { + return null; } - bufferChunks = null; - streams.clear(); } - /** - * Read the current stripe into memory. 
- * - * @throws IOException - */ - private void readStripe() throws IOException { - StripeInformation stripe = beginReadStripe(); - includedRowGroups = pickRowGroups(); - - // move forward to the first unskipped row - if (includedRowGroups != null) { - while (rowInStripe < rowCountInStripe && - !includedRowGroups[(int) (rowInStripe / rowIndexStride)]) { - rowInStripe = Math.min(rowCountInStripe, rowInStripe + rowIndexStride); - } + static HiveDecimalWritable nextDecimal(ColumnVector vector, + int row, + Object previous) { + if (vector.isRepeating) { + row = 0; } - - // if we haven't skipped the whole stripe, read the data - if (rowInStripe < rowCountInStripe) { - // if we aren't projecting columns or filtering rows, just read it all - if (included == null && includedRowGroups == null) { - readAllDataStreams(stripe); + if (vector.noNulls || !vector.isNull[row]) { + HiveDecimalWritable result; + if (previous == null || previous.getClass() != HiveDecimalWritable.class) { + result = new HiveDecimalWritable(); } else { - readPartialDataStreams(stripe); - } - reader.startStripe(streams, stripeFooter); - // if we skipped the first row group, move the pointers forward - if (rowInStripe != 0) { - seekToRowEntry(reader, (int) (rowInStripe / rowIndexStride)); + result = (HiveDecimalWritable) previous; } + result.set(((DecimalColumnVector) vector).vector[row]); + return result; + } else { + return null; } } - private StripeInformation beginReadStripe() throws IOException { - StripeInformation stripe = stripes.get(currentStripe); - stripeFooter = readStripeFooter(stripe); - clearStreams(); - // setup the position in the stripe - rowCountInStripe = stripe.getNumberOfRows(); - rowInStripe = 0; - rowBaseInStripe = 0; - for (int i = 0; i < currentStripe; ++i) { - rowBaseInStripe += stripes.get(i).getNumberOfRows(); + static DateWritable nextDate(ColumnVector vector, + int row, + Object previous) { + if (vector.isRepeating) { + row = 0; } - // reset all of the indexes - for (int i = 0; i < indexes.length; ++i) { - indexes[i] = null; + if (vector.noNulls || !vector.isNull[row]) { + DateWritable result; + if (previous == null || previous.getClass() != DateWritable.class) { + result = new DateWritable(); + } else { + result = (DateWritable) previous; + } + int date = (int) ((LongColumnVector) vector).vector[row]; + result.set(date); + return result; + } else { + return null; } - return stripe; } - private void readAllDataStreams(StripeInformation stripe) throws IOException { - long start = stripe.getIndexLength(); - long end = start + stripe.getDataLength(); - // explicitly trigger 1 big read - DiskRangeList toRead = new DiskRangeList(start, end); - bufferChunks = dataReader.readFileData(toRead, stripe.getOffset(), false); - List streamDescriptions = stripeFooter.getStreamsList(); - createStreams(streamDescriptions, bufferChunks, null, codec, bufferSize, streams); - } - - /** - * Plan the ranges of the file that we need to read given the list of - * columns and row groups. 
- * - * @param streamList the list of streams available - * @param indexes the indexes that have been loaded - * @param includedColumns which columns are needed - * @param includedRowGroups which row groups are needed - * @param isCompressed does the file have generic compression - * @param encodings the encodings for each column - * @param types the types of the columns - * @param compressionSize the compression block size - * @return the list of disk ranges that will be loaded - */ - static DiskRangeList planReadPartialDataStreams - (List streamList, - OrcProto.RowIndex[] indexes, - boolean[] includedColumns, - boolean[] includedRowGroups, - boolean isCompressed, - List encodings, - List types, - int compressionSize, - boolean doMergeBuffers) { - long offset = 0; - // figure out which columns have a present stream - boolean[] hasNull = RecordReaderUtils.findPresentStreamsByColumn(streamList, types); - CreateHelper list = new CreateHelper(); - for (OrcProto.Stream stream : streamList) { - long length = stream.getLength(); - int column = stream.getColumn(); - OrcProto.Stream.Kind streamKind = stream.getKind(); - // since stream kind is optional, first check if it exists - if (stream.hasKind() && - (StreamName.getArea(streamKind) == StreamName.Area.DATA) && - (column < includedColumns.length && includedColumns[column])) { - // if we aren't filtering or it is a dictionary, load it. - if (includedRowGroups == null - || RecordReaderUtils.isDictionary(streamKind, encodings.get(column))) { - RecordReaderUtils.addEntireStreamToRanges(offset, length, list, doMergeBuffers); - } else { - RecordReaderUtils.addRgFilteredStreamToRanges(stream, includedRowGroups, - isCompressed, indexes[column], encodings.get(column), types.get(column), - compressionSize, hasNull[column], offset, length, list, doMergeBuffers); - } + static TimestampWritable nextTimestamp(ColumnVector vector, + int row, + Object previous) { + if (vector.isRepeating) { + row = 0; + } + if (vector.noNulls || !vector.isNull[row]) { + TimestampWritable result; + if (previous == null || previous.getClass() != TimestampWritable.class) { + result = new TimestampWritable(); + } else { + result = (TimestampWritable) previous; } - offset += length; + TimestampColumnVector tcv = (TimestampColumnVector) vector; + result.setInternal(tcv.time[row], tcv.nanos[row]); + return result; + } else { + return null; } - return list.extract(); } - void createStreams(List streamDescriptions, - DiskRangeList ranges, - boolean[] includeColumn, - CompressionCodec codec, - int bufferSize, - Map streams) throws IOException { - long streamOffset = 0; - for (OrcProto.Stream streamDesc : streamDescriptions) { - int column = streamDesc.getColumn(); - if ((includeColumn != null && - (column < included.length && !includeColumn[column])) || - streamDesc.hasKind() && - (StreamName.getArea(streamDesc.getKind()) != StreamName.Area.DATA)) { - streamOffset += streamDesc.getLength(); - continue; + static OrcStruct nextStruct(ColumnVector vector, + int row, + TypeDescription schema, + Object previous) { + if (vector.isRepeating) { + row = 0; + } + if (vector.noNulls || !vector.isNull[row]) { + OrcStruct result; + List childrenTypes = schema.getChildren(); + int numChildren = childrenTypes.size(); + if (previous == null || previous.getClass() != OrcStruct.class) { + result = new OrcStruct(numChildren); + } else { + result = (OrcStruct) previous; + result.setNumFields(numChildren); + } + StructColumnVector struct = (StructColumnVector) vector; + for(int f=0; f < numChildren; ++f) { 
+ result.setFieldValue(f, nextValue(struct.fields[f], row, + childrenTypes.get(f), result.getFieldValue(f))); } - List buffers = RecordReaderUtils.getStreamBuffers( - ranges, streamOffset, streamDesc.getLength()); - StreamName name = new StreamName(column, streamDesc.getKind()); - streams.put(name, InStream.create(name.toString(), buffers, - streamDesc.getLength(), codec, bufferSize)); - streamOffset += streamDesc.getLength(); + return result; + } else { + return null; } } - private void readPartialDataStreams(StripeInformation stripe) throws IOException { - List streamList = stripeFooter.getStreamsList(); - DiskRangeList toRead = planReadPartialDataStreams(streamList, - indexes, included, includedRowGroups, codec != null, - stripeFooter.getColumnsList(), types, bufferSize, true); - if (LOG.isDebugEnabled()) { - LOG.debug("chunks = " + RecordReaderUtils.stringifyDiskRanges(toRead)); + static OrcUnion nextUnion(ColumnVector vector, + int row, + TypeDescription schema, + Object previous) { + if (vector.isRepeating) { + row = 0; } - bufferChunks = dataReader.readFileData(toRead, stripe.getOffset(), false); - if (LOG.isDebugEnabled()) { - LOG.debug("merge = " + RecordReaderUtils.stringifyDiskRanges(bufferChunks)); + if (vector.noNulls || !vector.isNull[row]) { + OrcUnion result; + List childrenTypes = schema.getChildren(); + if (previous == null || previous.getClass() != OrcUnion.class) { + result = new OrcUnion(); + } else { + result = (OrcUnion) previous; + } + UnionColumnVector union = (UnionColumnVector) vector; + byte tag = (byte) union.tags[row]; + result.set(tag, nextValue(union.fields[tag], row, childrenTypes.get(tag), + result.getObject())); + return result; + } else { + return null; } - - createStreams(streamList, bufferChunks, included, codec, bufferSize, streams); - } - - @Override - public boolean hasNext() throws IOException { - return rowInStripe < rowCountInStripe; } - /** - * Read the next stripe until we find a row that we don't skip. - * - * @throws IOException - */ - private void advanceStripe() throws IOException { - rowInStripe = rowCountInStripe; - while (rowInStripe >= rowCountInStripe && - currentStripe < stripes.size() - 1) { - currentStripe += 1; - readStripe(); + static ArrayList nextList(ColumnVector vector, + int row, + TypeDescription schema, + Object previous) { + if (vector.isRepeating) { + row = 0; } - } - - /** - * Skip over rows that we aren't selecting, so that the next row is - * one that we will read. 
- * - * @param nextRow the row we want to go to - * @throws IOException - */ - private boolean advanceToNextRow( - TreeReaderFactory.TreeReader reader, long nextRow, boolean canAdvanceStripe) - throws IOException { - long nextRowInStripe = nextRow - rowBaseInStripe; - // check for row skipping - if (rowIndexStride != 0 && - includedRowGroups != null && - nextRowInStripe < rowCountInStripe) { - int rowGroup = (int) (nextRowInStripe / rowIndexStride); - if (!includedRowGroups[rowGroup]) { - while (rowGroup < includedRowGroups.length && !includedRowGroups[rowGroup]) { - rowGroup += 1; - } - if (rowGroup >= includedRowGroups.length) { - if (canAdvanceStripe) { - advanceStripe(); - } - return canAdvanceStripe; + if (vector.noNulls || !vector.isNull[row]) { + ArrayList result; + if (previous == null || previous.getClass() != ArrayList.class) { + result = new ArrayList<>(); + } else { + result = (ArrayList) previous; + } + ListColumnVector list = (ListColumnVector) vector; + int length = (int) list.lengths[row]; + int offset = (int) list.offsets[row]; + result.ensureCapacity(length); + int oldLength = result.size(); + int idx = 0; + TypeDescription childType = schema.getChildren().get(0); + while (idx < length && idx < oldLength) { + result.set(idx, nextValue(list.child, offset + idx, childType, + result.get(idx))); + idx += 1; + } + if (length < oldLength) { + result.subList(length,result.size()).clear(); + } else if (oldLength < length) { + while (idx < length) { + result.add(nextValue(list.child, offset + idx, childType, null)); + idx += 1; } - nextRowInStripe = Math.min(rowCountInStripe, rowGroup * rowIndexStride); } + return result; + } else { + return null; } - if (nextRowInStripe >= rowCountInStripe) { - if (canAdvanceStripe) { - advanceStripe(); - } - return canAdvanceStripe; + } + + static HashMap nextMap(ColumnVector vector, + int row, + TypeDescription schema, + Object previous) { + if (vector.isRepeating) { + row = 0; } - if (nextRowInStripe != rowInStripe) { - if (rowIndexStride != 0) { - int rowGroup = (int) (nextRowInStripe / rowIndexStride); - seekToRowEntry(reader, rowGroup); - reader.skipRows(nextRowInStripe - rowGroup * rowIndexStride); + if (vector.noNulls || !vector.isNull[row]) { + MapColumnVector map = (MapColumnVector) vector; + int length = (int) map.lengths[row]; + int offset = (int) map.offsets[row]; + TypeDescription keyType = schema.getChildren().get(0); + TypeDescription valueType = schema.getChildren().get(1); + HashMap result; + if (previous == null || previous.getClass() != HashMap.class) { + result = new HashMap(length); } else { - reader.skipRows(nextRowInStripe - rowInStripe); + result = (HashMap) previous; + // I couldn't think of a good way to reuse the keys and value objects + // without even more allocations, so take the easy and safe approach. 
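Editor's note: the deleted advanceToNextRow in this hunk converts an absolute row position into a row-group index using the row index stride and skips over groups that the SARG evaluation excluded. A small stand-alone sketch of that arithmetic, treating the included-row-group mask as a plain boolean array (names are illustrative):

    // Given a target row within a stripe, a row-index stride, and a mask of
    // row groups selected by predicate pushdown, return the first selectable
    // row at or after the target, or -1 if the rest of the stripe is skipped.
    final class RowGroupSkipper {
      static long nextSelectableRow(long rowInStripe, long rowCountInStripe,
                                    long rowIndexStride, boolean[] includedRowGroups) {
        if (includedRowGroups == null || rowIndexStride == 0) {
          return rowInStripe < rowCountInStripe ? rowInStripe : -1;
        }
        int rowGroup = (int) (rowInStripe / rowIndexStride);
        while (rowGroup < includedRowGroups.length && !includedRowGroups[rowGroup]) {
          rowGroup += 1;                       // skip filtered-out row groups
        }
        if (rowGroup >= includedRowGroups.length) {
          return -1;                           // nothing left in this stripe
        }
        long candidate = Math.max(rowInStripe, (long) rowGroup * rowIndexStride);
        return candidate < rowCountInStripe ? candidate : -1;
      }
    }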
+ result.clear(); } - rowInStripe = nextRowInStripe; + for(int e=0; e < length; ++e) { + result.put(nextValue(map.keys, e + offset, keyType, null), + nextValue(map.values, e + offset, valueType, null)); + } + return result; + } else { + return null; } - return true; } - @Override - public Object next(Object previous) throws IOException { - try { - final Object result = reader.next(previous); - // find the next row - rowInStripe += 1; - advanceToNextRow(reader, rowInStripe + rowBaseInStripe, true); - return result; - } catch (IOException e) { - // Rethrow exception with file name in log message - throw new IOException("Error reading file: " + path, e); + static Object nextValue(ColumnVector vector, + int row, + TypeDescription schema, + Object previous) { + switch (schema.getCategory()) { + case BOOLEAN: + return nextBoolean(vector, row, previous); + case BYTE: + return nextByte(vector, row, previous); + case SHORT: + return nextShort(vector, row, previous); + case INT: + return nextInt(vector, row, previous); + case LONG: + return nextLong(vector, row, previous); + case FLOAT: + return nextFloat(vector, row, previous); + case DOUBLE: + return nextDouble(vector, row, previous); + case STRING: + return nextString(vector, row, previous); + case CHAR: + return nextChar(vector, row, schema.getMaxLength(), previous); + case VARCHAR: + return nextVarchar(vector, row, schema.getMaxLength(), previous); + case BINARY: + return nextBinary(vector, row, previous); + case DECIMAL: + return nextDecimal(vector, row, previous); + case DATE: + return nextDate(vector, row, previous); + case TIMESTAMP: + return nextTimestamp(vector, row, previous); + case STRUCT: + return nextStruct(vector, row, schema, previous); + case UNION: + return nextUnion(vector, row, schema, previous); + case LIST: + return nextList(vector, row, schema, previous); + case MAP: + return nextMap(vector, row, schema, previous); + default: + throw new IllegalArgumentException("Unknown type " + schema); } } - @Override - public boolean nextBatch(VectorizedRowBatch batch) throws IOException { - try { - if (rowInStripe >= rowCountInStripe) { - currentStripe += 1; - if (currentStripe >= stripes.size()) { - batch.size = 0; - return false; + /* Routines for copying between VectorizedRowBatches */ + + void copyLongColumn(ColumnVector destination, + ColumnVector source, + int sourceOffset, + int length) { + LongColumnVector lsource = (LongColumnVector) source; + LongColumnVector ldest = (LongColumnVector) destination; + ldest.isRepeating = lsource.isRepeating; + ldest.noNulls = lsource.noNulls; + if (source.isRepeating) { + ldest.isNull[0] = lsource.isNull[0]; + ldest.vector[0] = lsource.vector[0]; + } else { + if (!lsource.noNulls) { + for(int r=0; r < length; ++r) { + ldest.isNull[r] = lsource.isNull[sourceOffset + r]; + ldest.vector[r] = lsource.vector[sourceOffset + r]; + } + } else { + for (int r = 0; r < length; ++r) { + ldest.vector[r] = lsource.vector[sourceOffset + r]; } - readStripe(); } - - int batchSize = computeBatchSize(batch.getMaxSize()); - - rowInStripe += batchSize; - reader.setVectorColumnCount(batch.getDataColumnCount()); - reader.nextBatch(batch, batchSize); - - batch.size = (int) batchSize; - batch.selectedInUse = false; - advanceToNextRow(reader, rowInStripe + rowBaseInStripe, true); - return batch.size != 0; - } catch (IOException e) { - // Rethrow exception with file name in log message - throw new IOException("Error reading file: " + path, e); } } - private int computeBatchSize(long targetBatchSize) { - final int 
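Editor's note: the new nextValue() in this hunk is the per-cell dispatcher behind next(Object): it normalizes the row index for repeating vectors, checks the null mask, and then converts the cell according to the column's TypeDescription category. The stand-alone sketch below mirrors that shape for a few integer-backed categories; LongColumn and Category are simplified stand-ins for LongColumnVector and TypeDescription.Category:

    // Per-cell dispatch: repeating vectors only store slot 0, nulls come from
    // the isNull mask, and the logical type decides how the value is boxed.
    final class CellReader {
      enum Category { BOOLEAN, INT, LONG }

      static final class LongColumn {
        boolean isRepeating;
        boolean noNulls = true;
        boolean[] isNull;
        long[] vector;
      }

      static Object nextValue(LongColumn col, int row, Category category) {
        if (col.isRepeating) {
          row = 0;                             // repeating vectors hold one logical value
        }
        if (!col.noNulls && col.isNull[row]) {
          return null;                         // null cell
        }
        switch (category) {
          case BOOLEAN: return col.vector[row] != 0;
          case INT:     return (int) col.vector[row];
          case LONG:    return col.vector[row];
          default:      throw new IllegalArgumentException("Unknown type " + category);
        }
      }
    }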
batchSize; - // In case of PPD, batch size should be aware of row group boundaries. If only a subset of row - // groups are selected then marker position is set to the end of range (subset of row groups - // within strip). Batch size computed out of marker position makes sure that batch size is - // aware of row group boundary and will not cause overflow when reading rows - // illustration of this case is here https://issues.apache.org/jira/browse/HIVE-6287 - if (rowIndexStride != 0 && includedRowGroups != null && rowInStripe < rowCountInStripe) { - int startRowGroup = (int) (rowInStripe / rowIndexStride); - if (!includedRowGroups[startRowGroup]) { - while (startRowGroup < includedRowGroups.length && !includedRowGroups[startRowGroup]) { - startRowGroup += 1; + void copyDoubleColumn(ColumnVector destination, + ColumnVector source, + int sourceOffset, + int length) { + DoubleColumnVector castedSource = (DoubleColumnVector) source; + DoubleColumnVector castedDestination = (DoubleColumnVector) destination; + if (source.isRepeating) { + castedDestination.isRepeating = true; + castedDestination.noNulls = castedSource.noNulls; + castedDestination.isNull[0] = castedSource.isNull[0]; + castedDestination.vector[0] = castedSource.vector[0]; + } else { + if (!castedSource.noNulls) { + castedDestination.noNulls = true; + for(int r=0; r < length; ++r) { + castedDestination.isNull[r] = castedSource.isNull[sourceOffset + r]; } } - - int endRowGroup = startRowGroup; - while (endRowGroup < includedRowGroups.length && includedRowGroups[endRowGroup]) { - endRowGroup += 1; - } - - final long markerPosition = - (endRowGroup * rowIndexStride) < rowCountInStripe ? (endRowGroup * rowIndexStride) - : rowCountInStripe; - batchSize = (int) Math.min(targetBatchSize, (markerPosition - rowInStripe)); - - if (isLogDebugEnabled && batchSize < targetBatchSize) { - LOG.debug("markerPosition: " + markerPosition + " batchSize: " + batchSize); + for(int r=0; r < length; ++r) { + castedDestination.vector[r] = castedSource.vector[sourceOffset + r]; } - } else { - batchSize = (int) Math.min(targetBatchSize, (rowCountInStripe - rowInStripe)); } - return batchSize; - } - - @Override - public void close() throws IOException { - clearStreams(); - dataReader.close(); - } - - @Override - public long getRowNumber() { - return rowInStripe + rowBaseInStripe + firstRow; - } - - /** - * Return the fraction of rows that have been read from the selected. 
- * section of the file - * - * @return fraction between 0.0 and 1.0 of rows consumed - */ - @Override - public float getProgress() { - return ((float) rowBaseInStripe + rowInStripe) / totalRowCount; } - private int findStripe(long rowNumber) { - for (int i = 0; i < stripes.size(); i++) { - StripeInformation stripe = stripes.get(i); - if (stripe.getNumberOfRows() > rowNumber) { - return i; + void copyTimestampColumn(ColumnVector destination, + ColumnVector source, + int sourceOffset, + int length) { + TimestampColumnVector castedSource = (TimestampColumnVector) source; + TimestampColumnVector castedDestination = (TimestampColumnVector) destination; + castedDestination.isRepeating = castedSource.isRepeating; + castedDestination.noNulls = castedSource.noNulls; + if (source.isRepeating) { + castedDestination.isNull[0] = castedSource.isNull[0]; + castedDestination.time[0] = castedSource.time[0]; + castedDestination.nanos[0] = castedSource.nanos[0]; + } else { + if (!castedSource.noNulls) { + castedDestination.noNulls = true; + for(int r=0; r < length; ++r) { + castedDestination.isNull[r] = castedSource.isNull[sourceOffset + r]; + castedDestination.time[r] = castedSource.time[sourceOffset + r]; + castedDestination.nanos[r] = castedSource.nanos[sourceOffset + r]; + } + } else { + for (int r = 0; r < length; ++r) { + castedDestination.time[r] = castedSource.time[sourceOffset + r]; + castedDestination.nanos[r] = castedSource.nanos[sourceOffset + r]; + } } - rowNumber -= stripe.getNumberOfRows(); } - throw new IllegalArgumentException("Seek after the end of reader range"); } - OrcIndex readRowIndex( - int stripeIndex, boolean[] included, boolean[] sargColumns) throws IOException { - return readRowIndex(stripeIndex, included, null, null, sargColumns); + void copyDecimalColumn(ColumnVector destination, + ColumnVector source, + int sourceOffset, + int length) { + DecimalColumnVector castedSource = (DecimalColumnVector) source; + DecimalColumnVector castedDestination = (DecimalColumnVector) destination; + castedDestination.isRepeating = castedSource.isRepeating; + castedDestination.noNulls = castedSource.noNulls; + if (source.isRepeating) { + castedDestination.isNull[0] = castedSource.isNull[0]; + if (!castedSource.isNull[0]) { + castedDestination.set(0, castedSource.vector[0]); + } + } else { + if (!castedSource.noNulls) { + for(int r=0; r < length; ++r) { + castedDestination.isNull[r] = castedSource.isNull[sourceOffset + r]; + if (!castedDestination.isNull[r]) { + castedDestination.set(r, castedSource.vector[r]); + } + } + } else { + for (int r = 0; r < length; ++r) { + castedDestination.set(r, castedSource.vector[r]); + } + } + } } - OrcIndex readRowIndex(int stripeIndex, boolean[] included, OrcProto.RowIndex[] indexes, - OrcProto.BloomFilterIndex[] bloomFilterIndex, boolean[] sargColumns) throws IOException { - StripeInformation stripe = stripes.get(stripeIndex); - OrcProto.StripeFooter stripeFooter = null; - // if this is the current stripe, use the cached objects. - if (stripeIndex == currentStripe) { - stripeFooter = this.stripeFooter; - indexes = indexes == null ? this.indexes : indexes; - bloomFilterIndex = bloomFilterIndex == null ? this.bloomFilterIndices : bloomFilterIndex; - sargColumns = sargColumns == null ? - (sargApp == null ? 
null : sargApp.sargColumns) : sargColumns; + void copyBytesColumn(ColumnVector destination, + ColumnVector source, + int sourceOffset, + int length) { + BytesColumnVector castedSource = (BytesColumnVector) source; + BytesColumnVector castedDestination = (BytesColumnVector) destination; + castedDestination.isRepeating = castedSource.isRepeating; + castedDestination.noNulls = castedSource.noNulls; + if (source.isRepeating) { + castedDestination.isNull[0] = castedSource.isNull[0]; + if (!castedSource.isNull[0]) { + castedDestination.setVal(0, castedSource.vector[0], + castedSource.start[0], castedSource.length[0]); + } + } else { + if (!castedSource.noNulls) { + for(int r=0; r < length; ++r) { + castedDestination.isNull[r] = castedSource.isNull[sourceOffset + r]; + if (!castedDestination.isNull[r]) { + castedDestination.setVal(r, castedSource.vector[sourceOffset + r], + castedSource.start[sourceOffset + r], + castedSource.length[sourceOffset + r]); + } + } + } else { + for (int r = 0; r < length; ++r) { + castedDestination.setVal(r, castedSource.vector[sourceOffset + r], + castedSource.start[sourceOffset + r], + castedSource.length[sourceOffset + r]); + } + } } - return dataReader.readRowIndex(stripe, stripeFooter, included, indexes, - sargColumns, bloomFilterIndex); } - private void seekToRowEntry(TreeReaderFactory.TreeReader reader, int rowEntry) - throws IOException { - PositionProvider[] index = new PositionProvider[indexes.length]; - for (int i = 0; i < indexes.length; ++i) { - if (indexes[i] != null) { - index[i] = new PositionProviderImpl(indexes[i].getEntry(rowEntry)); + void copyStructColumn(ColumnVector destination, + ColumnVector source, + int sourceOffset, + int length) { + StructColumnVector castedSource = (StructColumnVector) source; + StructColumnVector castedDestination = (StructColumnVector) destination; + castedDestination.isRepeating = castedSource.isRepeating; + castedDestination.noNulls = castedSource.noNulls; + if (source.isRepeating) { + castedDestination.isNull[0] = castedSource.isNull[0]; + for(int c=0; c > castedSource.fields.length; ++c) { + copyColumn(castedDestination.fields[c], castedSource.fields[c], 0, 1); + } + } else { + if (!castedSource.noNulls) { + for (int r = 0; r < length; ++r) { + castedDestination.isNull[r] = castedSource.isNull[sourceOffset + r]; + } + } else { + for (int c = 0; c > castedSource.fields.length; ++c) { + copyColumn(castedDestination.fields[c], castedSource.fields[c], + sourceOffset, length); + } } } - reader.seek(index); } - @Override - public void seekToRow(long rowNumber) throws IOException { - if (rowNumber < 0) { - throw new IllegalArgumentException("Seek to a negative row number " + - rowNumber); - } else if (rowNumber < firstRow) { - throw new IllegalArgumentException("Seek before reader range " + - rowNumber); + void copyUnionColumn(ColumnVector destination, + ColumnVector source, + int sourceOffset, + int length) { + UnionColumnVector castedSource = (UnionColumnVector) source; + UnionColumnVector castedDestination = (UnionColumnVector) destination; + castedDestination.isRepeating = castedSource.isRepeating; + castedDestination.noNulls = castedSource.noNulls; + if (source.isRepeating) { + castedDestination.isNull[0] = castedSource.isNull[0]; + int tag = castedSource.tags[0]; + castedDestination.tags[0] = tag; + if (!castedDestination.isNull[0]) { + copyColumn(castedDestination.fields[tag], castedSource.fields[tag], 0, + 1); + } + } else { + if (!castedSource.noNulls) { + for (int r = 0; r < length; ++r) { + 
castedDestination.isNull[r] = castedSource.isNull[sourceOffset + r]; + castedDestination.tags[r] = castedSource.tags[sourceOffset + r]; + } + } else { + for(int r=0; r < length; ++r) { + castedDestination.tags[r] = castedSource.tags[sourceOffset + r]; + } + } + for(int c=0; c > castedSource.fields.length; ++c) { + copyColumn(castedDestination.fields[c], castedSource.fields[c], + sourceOffset, length); + } } - // convert to our internal form (rows from the beginning of slice) - rowNumber -= firstRow; + } - // move to the right stripe - int rightStripe = findStripe(rowNumber); - if (rightStripe != currentStripe) { - currentStripe = rightStripe; - readStripe(); + void copyListColumn(ColumnVector destination, + ColumnVector source, + int sourceOffset, + int length) { + ListColumnVector castedSource = (ListColumnVector) source; + ListColumnVector castedDestination = (ListColumnVector) destination; + castedDestination.isRepeating = castedSource.noNulls; + castedDestination.noNulls = castedSource.noNulls; + if (source.isRepeating) { + castedDestination.isNull[0] = castedSource.isNull[0]; + castedDestination.offsets[0] = 0; + castedDestination.lengths[0] = castedSource.lengths[0]; + copyColumn(castedDestination.child, castedSource.child, + (int) castedSource.offsets[0], (int) castedSource.lengths[0]); + } else { + if (!castedSource.noNulls) { + for (int r = 0; r < length; ++r) { + castedDestination.isNull[r] = castedSource.isNull[sourceOffset + r]; + } + } + int minOffset = Integer.MAX_VALUE; + int maxOffset = Integer.MIN_VALUE; + for(int r=0; r < length; ++r) { + int childOffset = (int) castedSource.offsets[r + sourceOffset]; + int childLength = (int) castedSource.lengths[r + sourceOffset]; + castedDestination.offsets[r] = childOffset; + castedDestination.lengths[r] = childLength; + minOffset = Math.min(minOffset, childOffset); + maxOffset = Math.max(maxOffset, childOffset + childLength); + } + if (minOffset <= maxOffset) { + castedDestination.childCount = maxOffset - minOffset + 1; + copyColumn(castedDestination.child, castedSource.child, + minOffset, castedDestination.childCount); + } else { + castedDestination.childCount = 0; + } + } + } + + void copyMapColumn(ColumnVector destination, + ColumnVector source, + int sourceOffset, + int length) { + MapColumnVector castedSource = (MapColumnVector) source; + MapColumnVector castedDestination = (MapColumnVector) destination; + castedDestination.isRepeating = castedSource.noNulls; + castedDestination.noNulls = castedSource.noNulls; + if (source.isRepeating) { + castedDestination.isNull[0] = castedSource.isNull[0]; + castedDestination.offsets[0] = 0; + castedDestination.lengths[0] = castedSource.lengths[0]; + copyColumn(castedDestination.keys, castedSource.keys, + (int) castedSource.offsets[0], (int) castedSource.lengths[0]); + copyColumn(castedDestination.values, castedSource.values, + (int) castedSource.offsets[0], (int) castedSource.lengths[0]); + } else { + if (!castedSource.noNulls) { + for (int r = 0; r < length; ++r) { + castedDestination.isNull[r] = castedSource.isNull[sourceOffset + r]; + } + } + int minOffset = Integer.MAX_VALUE; + int maxOffset = Integer.MIN_VALUE; + for(int r=0; r < length; ++r) { + int childOffset = (int) castedSource.offsets[r + sourceOffset]; + int childLength = (int) castedSource.lengths[r + sourceOffset]; + castedDestination.offsets[r] = childOffset; + castedDestination.lengths[r] = childLength; + minOffset = Math.min(minOffset, childOffset); + maxOffset = Math.max(maxOffset, childOffset + childLength); + } + if 
(minOffset <= maxOffset) { + castedDestination.childCount = maxOffset - minOffset + 1; + copyColumn(castedDestination.keys, castedSource.keys, + minOffset, castedDestination.childCount); + copyColumn(castedDestination.values, castedSource.values, + minOffset, castedDestination.childCount); + } else { + castedDestination.childCount = 0; + } } - readRowIndex(currentStripe, included, sargApp == null ? null : sargApp.sargColumns); - - // if we aren't to the right row yet, advance in the stripe. - advanceToNextRow(reader, rowNumber, true); } - private static final String TRANSLATED_SARG_SEPARATOR = "_"; - public static String encodeTranslatedSargColumn(int rootColumn, Integer indexInSourceTable) { - return rootColumn + TRANSLATED_SARG_SEPARATOR - + ((indexInSourceTable == null) ? -1 : indexInSourceTable); + void copyColumn(ColumnVector destination, + ColumnVector source, + int sourceOffset, + int length) { + if (source.getClass() == LongColumnVector.class) { + copyLongColumn(destination, source, sourceOffset, length); + } else if (source.getClass() == DoubleColumnVector.class) { + copyDoubleColumn(destination, source, sourceOffset, length); + } else if (source.getClass() == BytesColumnVector.class) { + copyBytesColumn(destination, source, sourceOffset, length); + } else if (source.getClass() == TimestampColumnVector.class) { + copyTimestampColumn(destination, source, sourceOffset, length); + } else if (source.getClass() == DecimalColumnVector.class) { + copyDecimalColumn(destination, source, sourceOffset, length); + } else if (source.getClass() == StructColumnVector.class) { + copyStructColumn(destination, source, sourceOffset, length); + } else if (source.getClass() == UnionColumnVector.class) { + copyUnionColumn(destination, source, sourceOffset, length); + } else if (source.getClass() == ListColumnVector.class) { + copyListColumn(destination, source, sourceOffset, length); + } else if (source.getClass() == MapColumnVector.class) { + copyMapColumn(destination, source, sourceOffset, length); + } } - public static int[] mapTranslatedSargColumns( - List types, List sargLeaves) { - int[] result = new int[sargLeaves.size()]; - OrcProto.Type lastRoot = null; // Root will be the same for everyone as of now. - String lastRootStr = null; - for (int i = 0; i < result.length; ++i) { - String[] rootAndIndex = sargLeaves.get(i).getColumnName().split(TRANSLATED_SARG_SEPARATOR); - assert rootAndIndex.length == 2; - String rootStr = rootAndIndex[0], indexStr = rootAndIndex[1]; - int index = Integer.parseInt(indexStr); - // First, check if the column even maps to anything. - if (index == -1) { - result[i] = -1; - continue; - } - assert index >= 0; - // Then, find the root type if needed. - if (!rootStr.equals(lastRootStr)) { - lastRoot = types.get(Integer.parseInt(rootStr)); - lastRootStr = rootStr; - } - // Subtypes of the root types correspond, in order, to the columns in the table schema - // (disregarding schema evolution that doesn't presently work). Get the index for the - // corresponding subtype. - result[i] = lastRoot.getSubtypes(index); - } - return result; + /** + * Copy part of a batch into the destination batch. 
+ * @param destination the batch to copy into + * @param source the batch to copy from + * @param sourceStart the row number to start from in the source + * @return the number of rows copied + */ + void copyIntoBatch(VectorizedRowBatch destination, + VectorizedRowBatch source, + int sourceStart) { + int rows = Math.min(source.size - sourceStart, destination.getMaxSize()); + for(int c=0; c < source.cols.length; ++c) { + destination.cols[c].reset(); + copyColumn(destination.cols[c], source.cols[c], sourceStart, rows); + } + destination.size = rows; } } diff --git a/ql/src/java/org/apache/hadoop/hive/ql/io/orc/encoded/EncodedReaderImpl.java b/ql/src/java/org/apache/hadoop/hive/ql/io/orc/encoded/EncodedReaderImpl.java index 40cc86f7df0e..dad35e3bc586 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/io/orc/encoded/EncodedReaderImpl.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/io/orc/encoded/EncodedReaderImpl.java @@ -38,7 +38,7 @@ import org.apache.orc.DataReader; import org.apache.orc.OrcConf; import org.apache.orc.impl.OutStream; -import org.apache.hadoop.hive.ql.io.orc.RecordReaderUtils; +import org.apache.orc.impl.RecordReaderUtils; import org.apache.orc.impl.StreamName; import org.apache.orc.StripeInformation; import org.apache.orc.impl.BufferChunk; diff --git a/ql/src/java/org/apache/hadoop/hive/ql/io/orc/encoded/EncodedTreeReaderFactory.java b/ql/src/java/org/apache/hadoop/hive/ql/io/orc/encoded/EncodedTreeReaderFactory.java index fe46446d2aa9..b44da0689f98 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/io/orc/encoded/EncodedTreeReaderFactory.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/io/orc/encoded/EncodedTreeReaderFactory.java @@ -25,7 +25,7 @@ import org.apache.orc.CompressionCodec; import org.apache.orc.impl.PositionProvider; import org.apache.orc.impl.SettableUncompressedStream; -import org.apache.hadoop.hive.ql.io.orc.TreeReaderFactory; +import org.apache.orc.impl.TreeReaderFactory; import org.apache.orc.OrcProto; public class EncodedTreeReaderFactory extends TreeReaderFactory { diff --git a/ql/src/test/org/apache/hadoop/hive/ql/TestTxnCommands.java b/ql/src/test/org/apache/hadoop/hive/ql/TestTxnCommands.java index b20ce2817ece..e4cbd5f2cc9b 100644 --- a/ql/src/test/org/apache/hadoop/hive/ql/TestTxnCommands.java +++ b/ql/src/test/org/apache/hadoop/hive/ql/TestTxnCommands.java @@ -23,7 +23,6 @@ import org.apache.hadoop.hive.conf.HiveConf; import org.apache.hadoop.hive.metastore.txn.TxnDbUtil; import org.apache.hadoop.hive.ql.io.AcidUtils; -import org.apache.hadoop.hive.ql.io.orc.FileDump; import org.apache.hadoop.hive.ql.processors.CommandProcessorResponse; import org.apache.hadoop.hive.ql.session.SessionState; import org.apache.hadoop.hive.ql.txn.AcidHouseKeeperService; @@ -36,7 +35,6 @@ import org.junit.rules.TestName; import java.io.File; -import java.io.FileNotFoundException; import java.io.FileOutputStream; import java.util.ArrayList; import java.util.Arrays; diff --git a/ql/src/test/org/apache/hadoop/hive/ql/exec/vector/TestTimestampWritableAndColumnVector.java b/ql/src/test/org/apache/hadoop/hive/ql/exec/vector/TestTimestampWritableAndColumnVector.java index 6c462574fdc0..2fa9ab2e6cbf 100644 --- a/ql/src/test/org/apache/hadoop/hive/ql/exec/vector/TestTimestampWritableAndColumnVector.java +++ b/ql/src/test/org/apache/hadoop/hive/ql/exec/vector/TestTimestampWritableAndColumnVector.java @@ -20,14 +20,11 @@ import org.junit.Test; -import java.math.BigDecimal; -import java.math.RoundingMode; import java.sql.Timestamp; -import java.util.Date; import 
java.util.Random; import org.apache.hadoop.hive.common.type.RandomTypeUtil; -import org.apache.hadoop.hive.serde2.io.TimestampWritable; +import org.apache.hadoop.hive.ql.util.TimestampUtils; import static org.junit.Assert.*; @@ -58,7 +55,7 @@ public void testDouble() throws Exception { if (!retrievedTimestamp.equals(randTimestamp)) { assertTrue(false); } - double randDouble = TimestampWritable.getDouble(randTimestamp); + double randDouble = TimestampUtils.getDouble(randTimestamp); double retrievedDouble = timestampColVector.getDouble(i); if (randDouble != retrievedDouble) { assertTrue(false); diff --git a/ql/src/test/org/apache/hadoop/hive/ql/exec/vector/expressions/TestVectorTypeCasts.java b/ql/src/test/org/apache/hadoop/hive/ql/exec/vector/expressions/TestVectorTypeCasts.java index 1e41fce9ac0a..e7a044efd5a4 100644 --- a/ql/src/test/org/apache/hadoop/hive/ql/exec/vector/expressions/TestVectorTypeCasts.java +++ b/ql/src/test/org/apache/hadoop/hive/ql/exec/vector/expressions/TestVectorTypeCasts.java @@ -33,7 +33,6 @@ import junit.framework.Assert; -import org.apache.hadoop.hive.common.type.Decimal128; import org.apache.hadoop.hive.common.type.HiveDecimal; import org.apache.hadoop.hive.common.type.RandomTypeUtil; import org.apache.hadoop.hive.ql.exec.vector.BytesColumnVector; @@ -44,6 +43,7 @@ import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch; import org.apache.hadoop.hive.ql.exec.vector.expressions.gen.*; import org.apache.hadoop.hive.ql.exec.vector.expressions.*; +import org.apache.hadoop.hive.ql.util.TimestampUtils; import org.apache.hadoop.hive.serde2.io.TimestampWritable; import org.apache.hadoop.hive.serde2.typeinfo.HiveDecimalUtils; import org.junit.Test; @@ -91,8 +91,8 @@ public void testCastDoubleToTimestamp() { b.cols[0].noNulls = true; VectorExpression expr = new CastDoubleToTimestamp(0, 1); expr.evaluate(b); - Assert.assertEquals(0.0, TimestampWritable.getDouble(resultV.asScratchTimestamp(3))); - Assert.assertEquals(0.5d, TimestampWritable.getDouble(resultV.asScratchTimestamp(4))); + Assert.assertEquals(0.0, TimestampUtils.getDouble(resultV.asScratchTimestamp(3))); + Assert.assertEquals(0.5d, TimestampUtils.getDouble(resultV.asScratchTimestamp(4))); } @Test @@ -152,7 +152,7 @@ public void testCastTimestampToDouble() { expr.evaluate(b); for (int i = 0; i < doubleValues.length; i++) { double actual = resultV.vector[i]; - double doubleValue = TimestampWritable.getDouble(inV.asScratchTimestamp(i)); + double doubleValue = TimestampUtils.getDouble(inV.asScratchTimestamp(i)); assertEquals(actual, doubleValue, 0.000000001F); } } @@ -382,7 +382,7 @@ public void testCastDecimalToTimestamp() { TimestampColumnVector r = (TimestampColumnVector) b.cols[1]; for (int i = 0; i < doubleValues.length; i++) { Timestamp timestamp = r.asScratchTimestamp(i); - double asDouble = TimestampWritable.getDouble(timestamp); + double asDouble = TimestampUtils.getDouble(timestamp); double expectedDouble = doubleValues[i]; if (expectedDouble != asDouble) { assertTrue(false); diff --git a/ql/src/test/org/apache/hadoop/hive/ql/exec/vector/udf/TestVectorUDFAdaptor.java b/ql/src/test/org/apache/hadoop/hive/ql/exec/vector/udf/TestVectorUDFAdaptor.java index a7567b797db8..b78c1f2de5c9 100644 --- a/ql/src/test/org/apache/hadoop/hive/ql/exec/vector/udf/TestVectorUDFAdaptor.java +++ b/ql/src/test/org/apache/hadoop/hive/ql/exec/vector/udf/TestVectorUDFAdaptor.java @@ -27,8 +27,6 @@ import org.apache.hadoop.hive.ql.exec.vector.LongColumnVector; import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch; 
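
For readers tracing the timestamp test changes above: the vectorized cast tests now go through TimestampUtils (introduced by this patch) instead of TimestampWritable. The following is a minimal, self-contained sketch of the round trip those tests exercise, assuming the moved methods keep the seconds-plus-fractional-nanos encoding of the TimestampWritable code removed later in this patch; the class name and sample values are illustrative only.

    import java.sql.Timestamp;
    import org.apache.hadoop.hive.ql.util.TimestampUtils;

    public class TimestampDoubleRoundTrip {
      public static void main(String[] args) {
        // 1.5 seconds after the epoch, with the fractional half second kept in nanos
        Timestamp ts = new Timestamp(1500L);
        ts.setNanos(500000000);

        // seconds + nanos / 1e9, i.e. 1.5 for this value
        double asDouble = TimestampUtils.getDouble(ts);

        // converting back should recover the same instant at this precision
        Timestamp back = TimestampUtils.doubleToTimestamp(asDouble);
        System.out.println(asDouble + " -> " + back);
      }
    }
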
import org.apache.hadoop.hive.ql.exec.vector.expressions.StringExpr; -import org.apache.hadoop.hive.ql.exec.vector.udf.VectorUDFAdaptor; -import org.apache.hadoop.hive.ql.exec.vector.udf.VectorUDFArgDesc; import org.apache.hadoop.hive.ql.exec.vector.udf.generic.GenericUDFIsNull; import org.apache.hadoop.hive.ql.exec.vector.udf.legacy.ConcatTextLongDoubleUDF; import org.apache.hadoop.hive.ql.exec.vector.udf.legacy.LongUDF; diff --git a/ql/src/test/org/apache/hadoop/hive/ql/io/orc/TestOrcFile.java b/ql/src/test/org/apache/hadoop/hive/ql/io/orc/TestOrcFile.java index 1a97a6dfff23..c7c2c9d8e183 100644 --- a/ql/src/test/org/apache/hadoop/hive/ql/io/orc/TestOrcFile.java +++ b/ql/src/test/org/apache/hadoop/hive/ql/io/orc/TestOrcFile.java @@ -45,6 +45,7 @@ import org.apache.hadoop.fs.Path; import org.apache.hadoop.hive.common.type.HiveDecimal; import org.apache.hadoop.hive.conf.HiveConf; +import org.apache.hadoop.hive.llap.TypeDesc; import org.apache.hadoop.hive.ql.io.sarg.PredicateLeaf; import org.apache.hadoop.hive.ql.io.sarg.SearchArgument; import org.apache.hadoop.hive.ql.io.sarg.SearchArgumentFactory; @@ -537,7 +538,7 @@ public void testTimestamp() throws Exception { Reader reader = OrcFile.createReader(testFilePath, OrcFile.readerOptions(conf).filesystem(fs)); - RecordReader rows = reader.rows(null); + RecordReader rows = reader.rows(); int idx = 0; while (rows.hasNext()) { Object row = rows.next(null); @@ -574,7 +575,7 @@ public void testHiveDecimalAllNulls() throws Exception { List fields = readerInspector.getAllStructFieldRefs(); HiveDecimalObjectInspector doi = (HiveDecimalObjectInspector) readerInspector. getStructFieldRef("dec").getFieldObjectInspector(); - RecordReader rows = reader.rows(null); + RecordReader rows = reader.rows(); while (rows.hasNext()) { Object row = rows.next(null); assertEquals(null, doi.getPrimitiveWritableObject(readerInspector.getStructFieldData(row, @@ -617,7 +618,7 @@ public void testHiveDecimalIsNullReset() throws Exception { List fields = readerInspector.getAllStructFieldRefs(); HiveDecimalObjectInspector doi = (HiveDecimalObjectInspector) readerInspector. 
getStructFieldRef("dec").getFieldObjectInspector(); - RecordReader rows = reader.rows(null); + RecordReader rows = reader.rows(); int idx = 0; while (rows.hasNext()) { Object row = rows.next(null); @@ -1702,6 +1703,11 @@ public void testSeek() throws Exception { RecordReader rows = reader.rows(); OrcStruct row = null; for(int i=COUNT-1; i >= 0; --i) { + // since we are walking backwards, seek back a buffer width so that + // we load the previous buffer of rows + if (i % COUNT == COUNT - 1) { + rows.seekToRow(i - (COUNT - 1)); + } rows.seekToRow(i); row = (OrcStruct) rows.next(row); BigRow expected = createRandomRow(intValues, doubleValues, @@ -1816,6 +1822,11 @@ public void testZeroCopySeek() throws Exception { /* all tests are identical to the other seek() tests */ OrcStruct row = null; for(int i=COUNT-1; i >= 0; --i) { + // since we are walking backwards, seek back a buffer width so that + // we load the previous buffer of rows + if (i % COUNT == COUNT - 1) { + rows.seekToRow(i - (COUNT - 1)); + } rows.seekToRow(i); row = (OrcStruct) rows.next(row); BigRow expected = createRandomRow(intValues, doubleValues, @@ -2067,10 +2078,11 @@ public void testPredicatePushdown() throws Exception { .range(0L, Long.MAX_VALUE) .include(new boolean[]{true, true, true}) .searchArgument(sarg, new String[]{null, "int1", "string1"})); - assertEquals(1000L, rows.getRowNumber()); + assertEquals(0L, rows.getRowNumber()); OrcStruct row = null; for(int i=1000; i < 2000; ++i) { assertTrue(rows.hasNext()); + assertEquals(i, rows.getRowNumber()); row = (OrcStruct) rows.next(row); assertEquals(300 * i, ((IntWritable) row.getFieldValue(0)).get()); assertEquals(Integer.toHexString(10*i), row.getFieldValue(1).toString()); @@ -2088,7 +2100,6 @@ public void testPredicatePushdown() throws Exception { .range(0L, Long.MAX_VALUE) .include(new boolean[]{true, true, true}) .searchArgument(sarg, new String[]{null, "int1", "string1"})); - assertEquals(3500L, rows.getRowNumber()); assertTrue(!rows.hasNext()); // select first 100 and last 100 rows @@ -2154,4 +2165,53 @@ public void testBitPack64Large() throws Exception { Assert.assertEquals(input.get(idx++).longValue(), ((LongWritable) row).get()); } } + + static class MyList { + List list = new ArrayList<>(); + } + + @Test + public void testListExpansion() throws Exception { + ObjectInspector inspector; + synchronized (TestOrcFile.class) { + inspector = ObjectInspectorFactory.getReflectionObjectInspector + (MyList.class, + ObjectInspectorFactory.ObjectInspectorOptions.JAVA); + } + Writer writer = OrcFile.createWriter(testFilePath, + OrcFile.writerOptions(conf).inspector(inspector)); + MyList row = new MyList(); + row.list.add(1); + row.list.add(2); + row.list.add(3); + writer.addRow(row); + row.list.clear(); + writer.addRow(row); + row.list.add(11); + row.list.add(12); + writer.addRow(row); + row.list = null; + writer.addRow(row); + row.list = new ArrayList<>(); + row.list.add(21); + row.list.add(22); + row.list.add(23); + row.list.add(24); + writer.addRow(row); + writer.close(); + RecordReader reader = OrcFile.createReader(testFilePath, + OrcFile.readerOptions(conf)).rows(); + assertEquals(true, reader.hasNext()); + OrcStruct orcrow = (OrcStruct) reader.next(null); + assertEquals(3, ((List) orcrow.getFieldValue(0)).size()); + orcrow = (OrcStruct) reader.next(row); + assertEquals(0, ((List) orcrow.getFieldValue(0)).size()); + orcrow = (OrcStruct) reader.next(row); + assertEquals(2, ((List) orcrow.getFieldValue(0)).size()); + assertEquals(null, ((OrcStruct) 
reader.next(row)).getFieldValue(0)); + orcrow = (OrcStruct) reader.next(row); + assertEquals(4, ((List) orcrow.getFieldValue(0)).size()); + assertEquals(false, reader.hasNext()); + reader.close(); + } } diff --git a/ql/src/test/org/apache/hadoop/hive/ql/io/orc/TestOrcRecordUpdater.java b/ql/src/test/org/apache/hadoop/hive/ql/io/orc/TestOrcRecordUpdater.java index 973cc409f7da..0a61fb884a8e 100644 --- a/ql/src/test/org/apache/hadoop/hive/ql/io/orc/TestOrcRecordUpdater.java +++ b/ql/src/test/org/apache/hadoop/hive/ql/io/orc/TestOrcRecordUpdater.java @@ -40,6 +40,8 @@ import org.apache.hadoop.io.LongWritable; import org.apache.hadoop.io.Text; import org.apache.hadoop.mapred.Reporter; +import org.apache.orc.impl.OrcAcidUtils; +import org.apache.orc.tools.FileDump; import org.junit.Test; public class TestOrcRecordUpdater { @@ -115,7 +117,7 @@ public void testWriter() throws Exception { assertEquals(5L, updater.getStats().getRowCount()); Path bucketPath = AcidUtils.createFilename(root, options); - Path sidePath = OrcRecordUpdater.getSideFile(bucketPath); + Path sidePath = OrcAcidUtils.getSideFile(bucketPath); DataInputStream side = fs.open(sidePath); // read the stopping point for the first flush and make sure we only see diff --git a/ql/src/test/results/clientpositive/orc_create.q.out b/ql/src/test/results/clientpositive/orc_create.q.out index 20c3fce599f1..34ab00debfec 100644 --- a/ql/src/test/results/clientpositive/orc_create.q.out +++ b/ql/src/test/results/clientpositive/orc_create.q.out @@ -380,9 +380,9 @@ POSTHOOK: query: SELECT * from orc_create_complex POSTHOOK: type: QUERY POSTHOOK: Input: default@orc_create_complex #### A masked pattern was here #### -line1 {"key11":"value11","key12":"value12","key13":"value13"} ["a","b","c"] {"A":"one","B":"two"} -line2 {"key21":"value21","key22":"value22","key23":"value23"} ["d","e","f"] {"A":"three","B":"four"} -line3 {"key31":"value31","key32":"value32","key33":"value33"} ["g","h","i"] {"A":"five","B":"six"} +line1 {"key13":"value13","key12":"value12","key11":"value11"} ["a","b","c"] {"A":"one","B":"two"} +line2 {"key21":"value21","key23":"value23","key22":"value22"} ["d","e","f"] {"A":"three","B":"four"} +line3 {"key33":"value33","key31":"value31","key32":"value32"} ["g","h","i"] {"A":"five","B":"six"} PREHOOK: query: SELECT str from orc_create_complex PREHOOK: type: QUERY PREHOOK: Input: default@orc_create_complex @@ -402,9 +402,9 @@ POSTHOOK: query: SELECT mp from orc_create_complex POSTHOOK: type: QUERY POSTHOOK: Input: default@orc_create_complex #### A masked pattern was here #### -{"key11":"value11","key12":"value12","key13":"value13"} -{"key21":"value21","key22":"value22","key23":"value23"} -{"key31":"value31","key32":"value32","key33":"value33"} +{"key13":"value13","key12":"value12","key11":"value11"} +{"key21":"value21","key23":"value23","key22":"value22"} +{"key33":"value33","key31":"value31","key32":"value32"} PREHOOK: query: SELECT lst from orc_create_complex PREHOOK: type: QUERY PREHOOK: Input: default@orc_create_complex diff --git a/ql/src/test/results/clientpositive/orc_int_type_promotion.q.out b/ql/src/test/results/clientpositive/orc_int_type_promotion.q.out index 4b7b0b0e89fe..3b2e962f8982 100644 --- a/ql/src/test/results/clientpositive/orc_int_type_promotion.q.out +++ b/ql/src/test/results/clientpositive/orc_int_type_promotion.q.out @@ -126,8 +126,8 @@ POSTHOOK: query: select * from alltypes_orc POSTHOOK: type: QUERY POSTHOOK: Input: default@alltypes_orc #### A masked pattern was here #### -true 10 100 1000 10000 4.0 20.0 4.222 
1969-12-31 15:59:58.174 1970-01-01 string hello hello {"k1":"v1","k2":"v2"} [100,200] {"c1":null,"c2":" \"foo\"}"} -false 20 200 2000 20000 8.0 40.0 2.222 1970-12-31 15:59:58.174 1971-01-01 abcd world world {"k3":"v3","k4":"v4"} [200,300] {"c1":null,"c2":" \"bar\"}"} +true 10 100 1000 10000 4.0 20.0 4.222 1969-12-31 15:59:58.174 1970-01-01 string hello hello {"k2":"v2","k1":"v1"} [100,200] {"c1":null,"c2":" \"foo\"}"} +false 20 200 2000 20000 8.0 40.0 2.222 1970-12-31 15:59:58.174 1971-01-01 abcd world world {"k4":"v4","k3":"v3"} [200,300] {"c1":null,"c2":" \"bar\"}"} PREHOOK: query: alter table alltypes_orc change si si int PREHOOK: type: ALTERTABLE_RENAMECOL PREHOOK: Input: default@alltypes_orc @@ -144,8 +144,8 @@ POSTHOOK: query: select * from alltypes_orc POSTHOOK: type: QUERY POSTHOOK: Input: default@alltypes_orc #### A masked pattern was here #### -true 10 100 1000 10000 4.0 20.0 4.222 1969-12-31 15:59:58.174 1970-01-01 string hello hello {"k1":"v1","k2":"v2"} [100,200] {"c1":null,"c2":" \"foo\"}"} -false 20 200 2000 20000 8.0 40.0 2.222 1970-12-31 15:59:58.174 1971-01-01 abcd world world {"k3":"v3","k4":"v4"} [200,300] {"c1":null,"c2":" \"bar\"}"} +true 10 100 1000 10000 4.0 20.0 4.222 1969-12-31 15:59:58.174 1970-01-01 string hello hello {"k2":"v2","k1":"v1"} [100,200] {"c1":null,"c2":" \"foo\"}"} +false 20 200 2000 20000 8.0 40.0 2.222 1970-12-31 15:59:58.174 1971-01-01 abcd world world {"k4":"v4","k3":"v3"} [200,300] {"c1":null,"c2":" \"bar\"}"} PREHOOK: query: alter table alltypes_orc change si si bigint PREHOOK: type: ALTERTABLE_RENAMECOL PREHOOK: Input: default@alltypes_orc @@ -170,8 +170,8 @@ POSTHOOK: query: select * from alltypes_orc POSTHOOK: type: QUERY POSTHOOK: Input: default@alltypes_orc #### A masked pattern was here #### -true 10 100 1000 10000 4.0 20.0 4.222 1969-12-31 15:59:58.174 1970-01-01 string hello hello {"k1":"v1","k2":"v2"} [100,200] {"c1":null,"c2":" \"foo\"}"} -false 20 200 2000 20000 8.0 40.0 2.222 1970-12-31 15:59:58.174 1971-01-01 abcd world world {"k3":"v3","k4":"v4"} [200,300] {"c1":null,"c2":" \"bar\"}"} +true 10 100 1000 10000 4.0 20.0 4.222 1969-12-31 15:59:58.174 1970-01-01 string hello hello {"k2":"v2","k1":"v1"} [100,200] {"c1":null,"c2":" \"foo\"}"} +false 20 200 2000 20000 8.0 40.0 2.222 1970-12-31 15:59:58.174 1971-01-01 abcd world world {"k4":"v4","k3":"v3"} [200,300] {"c1":null,"c2":" \"bar\"}"} PREHOOK: query: explain select ti, si, i, bi from alltypes_orc PREHOOK: type: QUERY POSTHOOK: query: explain select ti, si, i, bi from alltypes_orc diff --git a/ql/src/test/results/clientpositive/schema_evol_orc_vec_mapwork_part_all_primitive.q.out b/ql/src/test/results/clientpositive/schema_evol_orc_vec_mapwork_part_all_primitive.q.out index bd309e69f8f4..e29b357eeff8 100644 --- a/ql/src/test/results/clientpositive/schema_evol_orc_vec_mapwork_part_all_primitive.q.out +++ b/ql/src/test/results/clientpositive/schema_evol_orc_vec_mapwork_part_all_primitive.q.out @@ -1719,10 +1719,10 @@ POSTHOOK: Input: default@part_change_various_various_string@part=1 POSTHOOK: Input: default@part_change_various_various_string@part=2 #### A masked pattern was here #### insert_num part c1 c2 c3 c4 c5 c6 c7 c8 c9 c10 c11 c12 c13 b -1 1 TRUE NULL NULL 3244222 -99999999999 -29.0764 4.70614135E8 470614135 dynamic reptile dynamic reptile 0004-09-22 18:26:29.519542222 2007-02-09 binary original -2 1 TRUE 100 NULL 14 -23866739993 -3651.672 46114.284799488 46114.284799488 baffling baffling 2007-02-09 05:17:29.368756876 0004-09-22 binary original -3 1 FALSE 72 NULL -93222 30 
-66475.56 -66475.561431 0.561431 1 1 6229-06-28 02:54:28.970117179 5966-07-09 binary original -4 1 TRUE -90 NULL 3289094 46114 9250341.0 9250340.75 9250340.75 junkyard junkyard 2002-05-10 05:29:48.990818073 1815-05-06 binary original +1 1 TRUE NULL NULL 3244222 -99999999999 -29.0764 4.70614135E8 470614135 dynamic reptile dynamic reptile 0004-09-22 18:26:29.519542222 2007-02-09 62 69 6e 61 72 79 original +2 1 TRUE 100 NULL 14 -23866739993 -3651.672 46114.284799488 46114.284799488 baffling baffling 2007-02-09 05:17:29.368756876 0004-09-22 62 69 6e 61 72 79 original +3 1 FALSE 72 NULL -93222 30 -66475.56 -66475.561431 0.561431 1 1 6229-06-28 02:54:28.970117179 5966-07-09 62 69 6e 61 72 79 original +4 1 TRUE -90 NULL 3289094 46114 9250341.0 9250340.75 9250340.75 junkyard junkyard 2002-05-10 05:29:48.990818073 1815-05-06 62 69 6e 61 72 79 original 5 2 true 400 44388 -100 953967041. 62.079153 718.78 1 verdict verdict timestamp date binary new 6 1 -false -67 833 63993 1255178165.77663 905070.974 -4314.7918 -1240033819 trial trial 2016-03-07 03:02:22.0 2016-03-07 binary new PREHOOK: query: drop table part_change_various_various_string @@ -1916,10 +1916,10 @@ POSTHOOK: Input: default@part_change_various_various_char@part=1 POSTHOOK: Input: default@part_change_various_various_char@part=2 #### A masked pattern was here #### insert_num part c1 c2 c3 c4 c5 c6 c7 c8 c9 c10 c11 c12 c13 b -1 1 TRUE NULL NULL 3244222 -99999999999 -29.0764 4.70614135E8 470614135 dynamic reptile dynamic reptile 0004-09-22 18:26:29.51954 2007-02-09 binary original -2 1 TRUE 100 NULL 14 -23866739993 -3651.672 46114.284799488 46114.284799488 baffling baffling 2007-02-09 05:17:29.36875 0004-09-22 binary original -3 1 FALSE 72 NULL -93222 30 -66475.56 -66475.561431 0.561431 1 1 6229-06-28 02:54:28.97011 5966-07-09 binary original -4 1 TRUE -90 NULL 3289094 46114 9250341.0 9250340.75 9250340.75 junkyard junkyard 2002-05-10 05:29:48.99081 1815-05-06 binary original +1 1 TRUE NULL NULL 3244222 -99999999999 -29.0764 4.70614135E8 470614135 dynamic reptile dynamic reptile 0004-09-22 18:26:29.51954 2007-02-09 62 69 6e 61 72 79 original +2 1 TRUE 100 NULL 14 -23866739993 -3651.672 46114.284799488 46114.284799488 baffling baffling 2007-02-09 05:17:29.36875 0004-09-22 62 69 6e 61 72 79 original +3 1 FALSE 72 NULL -93222 30 -66475.56 -66475.561431 0.561431 1 1 6229-06-28 02:54:28.97011 5966-07-09 62 69 6e 61 72 79 original +4 1 TRUE -90 NULL 3289094 46114 9250341.0 9250340.75 9250340.75 junkyard junkyard 2002-05-10 05:29:48.99081 1815-05-06 62 69 6e 61 72 79 original 5 2 true 400 44388 -100 953967041. 62.079153 718.78 1 verdict verdict timestamp date binary new 6 1 -false -67 833 63993 1255178165.77663 905070.974 -4314.7918 -1240033819 trial trial 2016-03-07 03:02:22.0 2016-03-07 binary new PREHOOK: query: drop table part_change_various_various_char @@ -2113,10 +2113,10 @@ POSTHOOK: Input: default@part_change_various_various_char_trunc@part=1 POSTHOOK: Input: default@part_change_various_various_char_trunc@part=2 #### A masked pattern was here #### insert_num part c1 c2 c3 c4 c5 c6 c7 c8 c9 c10 c11 c12 c13 b -1 1 TRUE NULL NULL 3244222 -9999999 -29.0764 4.706141 47061413 dynamic dynamic 0004-09- 2007-02- binary original -2 1 TRUE 100 NULL 14 -2386673 -3651.67 46114.28 46114.28 baffli baffli 2007-02- 0004-09- binary original -3 1 FALSE 72 NULL -93222 30 -66475.5 -66475.5 0.561431 1 1 6229-06- 5966-07- binary original -4 1 TRUE -90 NULL 3289094 46114 9250341. 9250340. 9250340. 
junkyard junkyard 2002-05- 1815-05- binary original +1 1 TRUE NULL NULL 3244222 -9999999 -29.0764 4.706141 47061413 dynamic dynamic 0004-09- 2007-02- 62 69 6e original +2 1 TRUE 100 NULL 14 -2386673 -3651.67 46114.28 46114.28 baffli baffli 2007-02- 0004-09- 62 69 6e original +3 1 FALSE 72 NULL -93222 30 -66475.5 -66475.5 0.561431 1 1 6229-06- 5966-07- 62 69 6e original +4 1 TRUE -90 NULL 3289094 46114 9250341. 9250340. 9250340. junkyard junkyard 2002-05- 1815-05- 62 69 6e original 5 2 true 400 44388 -100 95396704 62.07915 718.78 1 verdict verdict timestam date binary new 6 1 -false -67 833 63993 1255178 905070.9 -4314.79 -1240033 trial trial 2016-03- 2016-03- binary new PREHOOK: query: drop table part_change_various_various_char_trunc @@ -2310,10 +2310,10 @@ POSTHOOK: Input: default@part_change_various_various_varchar@part=1 POSTHOOK: Input: default@part_change_various_various_varchar@part=2 #### A masked pattern was here #### insert_num part c1 c2 c3 c4 c5 c6 c7 c8 c9 c10 c11 c12 c13 b -1 1 TRUE NULL NULL 3244222 -99999999999 -29.0764 4.70614135E8 470614135 dynamic reptile dynamic reptile 0004-09-22 18:26:29.51954 2007-02-09 binary original -2 1 TRUE 100 NULL 14 -23866739993 -3651.672 46114.284799488 46114.284799488 baffling baffling 2007-02-09 05:17:29.36875 0004-09-22 binary original -3 1 FALSE 72 NULL -93222 30 -66475.56 -66475.561431 0.561431 1 1 6229-06-28 02:54:28.97011 5966-07-09 binary original -4 1 TRUE -90 NULL 3289094 46114 9250341.0 9250340.75 9250340.75 junkyard junkyard 2002-05-10 05:29:48.99081 1815-05-06 binary original +1 1 TRUE NULL NULL 3244222 -99999999999 -29.0764 4.70614135E8 470614135 dynamic reptile dynamic reptile 0004-09-22 18:26:29.51954 2007-02-09 62 69 6e 61 72 79 original +2 1 TRUE 100 NULL 14 -23866739993 -3651.672 46114.284799488 46114.284799488 baffling baffling 2007-02-09 05:17:29.36875 0004-09-22 62 69 6e 61 72 79 original +3 1 FALSE 72 NULL -93222 30 -66475.56 -66475.561431 0.561431 1 1 6229-06-28 02:54:28.97011 5966-07-09 62 69 6e 61 72 79 original +4 1 TRUE -90 NULL 3289094 46114 9250341.0 9250340.75 9250340.75 junkyard junkyard 2002-05-10 05:29:48.99081 1815-05-06 62 69 6e 61 72 79 original 5 2 true 400 44388 -100 953967041. 62.079153 718.78 1 verdict verdict timestamp date binary new 6 1 -false -67 833 63993 1255178165.77663 905070.974 -4314.7918 -1240033819 trial trial 2016-03-07 03:02:22.0 2016-03-07 binary new PREHOOK: query: drop table part_change_various_various_varchar @@ -2507,10 +2507,10 @@ POSTHOOK: Input: default@part_change_various_various_varchar_trunc@part=1 POSTHOOK: Input: default@part_change_various_various_varchar_trunc@part=2 #### A masked pattern was here #### insert_num part c1 c2 c3 c4 c5 c6 c7 c8 c9 c10 c11 c12 c13 b -1 1 TRUE NULL NULL 3244222 -9999999 -29.0764 4.706141 47061413 dynamic dynamic 0004-09- 2007-02- binary original -2 1 TRUE 100 NULL 14 -2386673 -3651.67 46114.28 46114.28 baffli baffli 2007-02- 0004-09- binary original -3 1 FALSE 72 NULL -93222 30 -66475.5 -66475.5 0.561431 1 1 6229-06- 5966-07- binary original -4 1 TRUE -90 NULL 3289094 46114 9250341. 9250340. 9250340. junkyard junkyard 2002-05- 1815-05- binary original +1 1 TRUE NULL NULL 3244222 -9999999 -29.0764 4.706141 47061413 dynamic dynamic 0004-09- 2007-02- 62 69 6e original +2 1 TRUE 100 NULL 14 -2386673 -3651.67 46114.28 46114.28 baffli baffli 2007-02- 0004-09- 62 69 6e original +3 1 FALSE 72 NULL -93222 30 -66475.5 -66475.5 0.561431 1 1 6229-06- 5966-07- 62 69 6e original +4 1 TRUE -90 NULL 3289094 46114 9250341. 9250340. 9250340. 
junkyard junkyard 2002-05- 1815-05- 62 69 6e original 5 2 true 400 44388 -100 95396704 62.07915 718.78 1 verdict verdict timestam date binary new 6 1 -false -67 833 63993 1255178 905070.9 -4314.79 -1240033 trial trial 2016-03- 2016-03- binary new PREHOOK: query: drop table part_change_various_various_varchar_trunc diff --git a/ql/src/test/results/clientpositive/tez/schema_evol_orc_vec_mapwork_part_all_primitive.q.out b/ql/src/test/results/clientpositive/tez/schema_evol_orc_vec_mapwork_part_all_primitive.q.out index f9f4d0bfd080..3721f5b58bee 100644 --- a/ql/src/test/results/clientpositive/tez/schema_evol_orc_vec_mapwork_part_all_primitive.q.out +++ b/ql/src/test/results/clientpositive/tez/schema_evol_orc_vec_mapwork_part_all_primitive.q.out @@ -1539,10 +1539,10 @@ POSTHOOK: Input: default@part_change_various_various_string@part=1 POSTHOOK: Input: default@part_change_various_various_string@part=2 #### A masked pattern was here #### insert_num part c1 c2 c3 c4 c5 c6 c7 c8 c9 c10 c11 c12 c13 b -1 1 TRUE NULL NULL 3244222 -99999999999 -29.0764 4.70614135E8 470614135 dynamic reptile dynamic reptile 0004-09-22 18:26:29.519542222 2007-02-09 binary original -2 1 TRUE 100 NULL 14 -23866739993 -3651.672 46114.284799488 46114.284799488 baffling baffling 2007-02-09 05:17:29.368756876 0004-09-22 binary original -3 1 FALSE 72 NULL -93222 30 -66475.56 -66475.561431 0.561431 1 1 6229-06-28 02:54:28.970117179 5966-07-09 binary original -4 1 TRUE -90 NULL 3289094 46114 9250341.0 9250340.75 9250340.75 junkyard junkyard 2002-05-10 05:29:48.990818073 1815-05-06 binary original +1 1 TRUE NULL NULL 3244222 -99999999999 -29.0764 4.70614135E8 470614135 dynamic reptile dynamic reptile 0004-09-22 18:26:29.519542222 2007-02-09 62 69 6e 61 72 79 original +2 1 TRUE 100 NULL 14 -23866739993 -3651.672 46114.284799488 46114.284799488 baffling baffling 2007-02-09 05:17:29.368756876 0004-09-22 62 69 6e 61 72 79 original +3 1 FALSE 72 NULL -93222 30 -66475.56 -66475.561431 0.561431 1 1 6229-06-28 02:54:28.970117179 5966-07-09 62 69 6e 61 72 79 original +4 1 TRUE -90 NULL 3289094 46114 9250341.0 9250340.75 9250340.75 junkyard junkyard 2002-05-10 05:29:48.990818073 1815-05-06 62 69 6e 61 72 79 original 5 2 true 400 44388 -100 953967041. 
62.079153 718.78 1 verdict verdict timestamp date binary new 6 1 -false -67 833 63993 1255178165.77663 905070.974 -4314.7918 -1240033819 trial trial 2016-03-07 03:02:22.0 2016-03-07 binary new PREHOOK: query: drop table part_change_various_various_string @@ -1716,10 +1716,10 @@ POSTHOOK: Input: default@part_change_various_various_char@part=1 POSTHOOK: Input: default@part_change_various_various_char@part=2 #### A masked pattern was here #### insert_num part c1 c2 c3 c4 c5 c6 c7 c8 c9 c10 c11 c12 c13 b -1 1 TRUE NULL NULL 3244222 -99999999999 -29.0764 4.70614135E8 470614135 dynamic reptile dynamic reptile 0004-09-22 18:26:29.51954 2007-02-09 binary original -2 1 TRUE 100 NULL 14 -23866739993 -3651.672 46114.284799488 46114.284799488 baffling baffling 2007-02-09 05:17:29.36875 0004-09-22 binary original -3 1 FALSE 72 NULL -93222 30 -66475.56 -66475.561431 0.561431 1 1 6229-06-28 02:54:28.97011 5966-07-09 binary original -4 1 TRUE -90 NULL 3289094 46114 9250341.0 9250340.75 9250340.75 junkyard junkyard 2002-05-10 05:29:48.99081 1815-05-06 binary original +1 1 TRUE NULL NULL 3244222 -99999999999 -29.0764 4.70614135E8 470614135 dynamic reptile dynamic reptile 0004-09-22 18:26:29.51954 2007-02-09 62 69 6e 61 72 79 original +2 1 TRUE 100 NULL 14 -23866739993 -3651.672 46114.284799488 46114.284799488 baffling baffling 2007-02-09 05:17:29.36875 0004-09-22 62 69 6e 61 72 79 original +3 1 FALSE 72 NULL -93222 30 -66475.56 -66475.561431 0.561431 1 1 6229-06-28 02:54:28.97011 5966-07-09 62 69 6e 61 72 79 original +4 1 TRUE -90 NULL 3289094 46114 9250341.0 9250340.75 9250340.75 junkyard junkyard 2002-05-10 05:29:48.99081 1815-05-06 62 69 6e 61 72 79 original 5 2 true 400 44388 -100 953967041. 62.079153 718.78 1 verdict verdict timestamp date binary new 6 1 -false -67 833 63993 1255178165.77663 905070.974 -4314.7918 -1240033819 trial trial 2016-03-07 03:02:22.0 2016-03-07 binary new PREHOOK: query: drop table part_change_various_various_char @@ -1893,10 +1893,10 @@ POSTHOOK: Input: default@part_change_various_various_char_trunc@part=1 POSTHOOK: Input: default@part_change_various_various_char_trunc@part=2 #### A masked pattern was here #### insert_num part c1 c2 c3 c4 c5 c6 c7 c8 c9 c10 c11 c12 c13 b -1 1 TRUE NULL NULL 3244222 -9999999 -29.0764 4.706141 47061413 dynamic dynamic 0004-09- 2007-02- binary original -2 1 TRUE 100 NULL 14 -2386673 -3651.67 46114.28 46114.28 baffli baffli 2007-02- 0004-09- binary original -3 1 FALSE 72 NULL -93222 30 -66475.5 -66475.5 0.561431 1 1 6229-06- 5966-07- binary original -4 1 TRUE -90 NULL 3289094 46114 9250341. 9250340. 9250340. junkyard junkyard 2002-05- 1815-05- binary original +1 1 TRUE NULL NULL 3244222 -9999999 -29.0764 4.706141 47061413 dynamic dynamic 0004-09- 2007-02- 62 69 6e original +2 1 TRUE 100 NULL 14 -2386673 -3651.67 46114.28 46114.28 baffli baffli 2007-02- 0004-09- 62 69 6e original +3 1 FALSE 72 NULL -93222 30 -66475.5 -66475.5 0.561431 1 1 6229-06- 5966-07- 62 69 6e original +4 1 TRUE -90 NULL 3289094 46114 9250341. 9250340. 9250340. 
junkyard junkyard 2002-05- 1815-05- 62 69 6e original 5 2 true 400 44388 -100 95396704 62.07915 718.78 1 verdict verdict timestam date binary new 6 1 -false -67 833 63993 1255178 905070.9 -4314.79 -1240033 trial trial 2016-03- 2016-03- binary new PREHOOK: query: drop table part_change_various_various_char_trunc @@ -2070,10 +2070,10 @@ POSTHOOK: Input: default@part_change_various_various_varchar@part=1 POSTHOOK: Input: default@part_change_various_various_varchar@part=2 #### A masked pattern was here #### insert_num part c1 c2 c3 c4 c5 c6 c7 c8 c9 c10 c11 c12 c13 b -1 1 TRUE NULL NULL 3244222 -99999999999 -29.0764 4.70614135E8 470614135 dynamic reptile dynamic reptile 0004-09-22 18:26:29.51954 2007-02-09 binary original -2 1 TRUE 100 NULL 14 -23866739993 -3651.672 46114.284799488 46114.284799488 baffling baffling 2007-02-09 05:17:29.36875 0004-09-22 binary original -3 1 FALSE 72 NULL -93222 30 -66475.56 -66475.561431 0.561431 1 1 6229-06-28 02:54:28.97011 5966-07-09 binary original -4 1 TRUE -90 NULL 3289094 46114 9250341.0 9250340.75 9250340.75 junkyard junkyard 2002-05-10 05:29:48.99081 1815-05-06 binary original +1 1 TRUE NULL NULL 3244222 -99999999999 -29.0764 4.70614135E8 470614135 dynamic reptile dynamic reptile 0004-09-22 18:26:29.51954 2007-02-09 62 69 6e 61 72 79 original +2 1 TRUE 100 NULL 14 -23866739993 -3651.672 46114.284799488 46114.284799488 baffling baffling 2007-02-09 05:17:29.36875 0004-09-22 62 69 6e 61 72 79 original +3 1 FALSE 72 NULL -93222 30 -66475.56 -66475.561431 0.561431 1 1 6229-06-28 02:54:28.97011 5966-07-09 62 69 6e 61 72 79 original +4 1 TRUE -90 NULL 3289094 46114 9250341.0 9250340.75 9250340.75 junkyard junkyard 2002-05-10 05:29:48.99081 1815-05-06 62 69 6e 61 72 79 original 5 2 true 400 44388 -100 953967041. 62.079153 718.78 1 verdict verdict timestamp date binary new 6 1 -false -67 833 63993 1255178165.77663 905070.974 -4314.7918 -1240033819 trial trial 2016-03-07 03:02:22.0 2016-03-07 binary new PREHOOK: query: drop table part_change_various_various_varchar @@ -2247,10 +2247,10 @@ POSTHOOK: Input: default@part_change_various_various_varchar_trunc@part=1 POSTHOOK: Input: default@part_change_various_various_varchar_trunc@part=2 #### A masked pattern was here #### insert_num part c1 c2 c3 c4 c5 c6 c7 c8 c9 c10 c11 c12 c13 b -1 1 TRUE NULL NULL 3244222 -9999999 -29.0764 4.706141 47061413 dynamic dynamic 0004-09- 2007-02- binary original -2 1 TRUE 100 NULL 14 -2386673 -3651.67 46114.28 46114.28 baffli baffli 2007-02- 0004-09- binary original -3 1 FALSE 72 NULL -93222 30 -66475.5 -66475.5 0.561431 1 1 6229-06- 5966-07- binary original -4 1 TRUE -90 NULL 3289094 46114 9250341. 9250340. 9250340. junkyard junkyard 2002-05- 1815-05- binary original +1 1 TRUE NULL NULL 3244222 -9999999 -29.0764 4.706141 47061413 dynamic dynamic 0004-09- 2007-02- 62 69 6e original +2 1 TRUE 100 NULL 14 -2386673 -3651.67 46114.28 46114.28 baffli baffli 2007-02- 0004-09- 62 69 6e original +3 1 FALSE 72 NULL -93222 30 -66475.5 -66475.5 0.561431 1 1 6229-06- 5966-07- 62 69 6e original +4 1 TRUE -90 NULL 3289094 46114 9250341. 9250340. 9250340. 
junkyard junkyard 2002-05- 1815-05- 62 69 6e original 5 2 true 400 44388 -100 95396704 62.07915 718.78 1 verdict verdict timestam date binary new 6 1 -false -67 833 63993 1255178 905070.9 -4314.79 -1240033 trial trial 2016-03- 2016-03- binary new PREHOOK: query: drop table part_change_various_various_varchar_trunc diff --git a/ql/src/test/results/clientpositive/vector_complex_all.q.out b/ql/src/test/results/clientpositive/vector_complex_all.q.out index 1af37c3866ee..2ae7c1bbb3d4 100644 --- a/ql/src/test/results/clientpositive/vector_complex_all.q.out +++ b/ql/src/test/results/clientpositive/vector_complex_all.q.out @@ -108,9 +108,9 @@ POSTHOOK: type: QUERY POSTHOOK: Input: default@orc_create_complex #### A masked pattern was here #### orc_create_complex.str orc_create_complex.mp orc_create_complex.lst orc_create_complex.strct -line1 {"key11":"value11","key12":"value12","key13":"value13"} ["a","b","c"] {"a":"one","b":"two"} -line2 {"key21":"value21","key22":"value22","key23":"value23"} ["d","e","f"] {"a":"three","b":"four"} -line3 {"key31":"value31","key32":"value32","key33":"value33"} ["g","h","i"] {"a":"five","b":"six"} +line1 {"key13":"value13","key12":"value12","key11":"value11"} ["a","b","c"] {"a":"one","b":"two"} +line2 {"key21":"value21","key23":"value23","key22":"value22"} ["d","e","f"] {"a":"three","b":"four"} +line3 {"key33":"value33","key31":"value31","key32":"value32"} ["g","h","i"] {"a":"five","b":"six"} PREHOOK: query: -- However, since this query is not referencing the complex fields, it should vectorize. EXPLAIN SELECT COUNT(*) FROM orc_create_complex diff --git a/serde/src/java/org/apache/hadoop/hive/serde2/io/TimestampWritable.java b/serde/src/java/org/apache/hadoop/hive/serde2/io/TimestampWritable.java index 305fdbefb030..7d136b48d72a 100644 --- a/serde/src/java/org/apache/hadoop/hive/serde2/io/TimestampWritable.java +++ b/serde/src/java/org/apache/hadoop/hive/serde2/io/TimestampWritable.java @@ -21,13 +21,13 @@ import java.io.DataOutput; import java.io.IOException; import java.io.OutputStream; -import java.math.BigDecimal; import java.sql.Timestamp; import java.text.DateFormat; import java.text.SimpleDateFormat; import java.util.Date; import org.apache.hadoop.hive.common.type.HiveDecimal; +import org.apache.hadoop.hive.ql.util.TimestampUtils; import org.apache.hadoop.hive.serde2.ByteStream.RandomAccessOutput; import org.apache.hadoop.hive.serde2.lazybinary.LazyBinaryUtils; import org.apache.hadoop.hive.serde2.lazybinary.LazyBinaryUtils.VInt; @@ -61,7 +61,6 @@ public class TimestampWritable implements WritableComparable private static final long SEVEN_BYTE_LONG_SIGN_FLIP = 0xff80L << 48; - private static final BigDecimal BILLION_BIG_DECIMAL = BigDecimal.valueOf(1000000000); /** The maximum number of bytes required for a TimestampWritable */ public static final int MAX_BYTES = 13; @@ -181,7 +180,7 @@ public void writeToByteStream(RandomAccessOutput byteStream) { */ public long getSeconds() { if (!timestampEmpty) { - return millisToSeconds(timestamp.getTime()); + return TimestampUtils.millisToSeconds(timestamp.getTime()); } else if (!bytesEmpty) { return TimestampWritable.getSeconds(currentBytes, offset); } else { @@ -313,7 +312,7 @@ private void checkBytes() { public double getDouble() { double seconds, nanos; if (bytesEmpty) { - seconds = millisToSeconds(timestamp.getTime()); + seconds = TimestampUtils.millisToSeconds(timestamp.getTime()); nanos = timestamp.getNanos(); } else { seconds = getSeconds(); @@ -326,17 +325,6 @@ public static long getLong(Timestamp timestamp) { 
return timestamp.getTime() / 1000; } - /** - * - * @return double representation of the timestamp, accurate to nanoseconds - */ - public static double getDouble(Timestamp timestamp) { - double seconds, nanos; - seconds = millisToSeconds(timestamp.getTime()); - nanos = timestamp.getNanos(); - return seconds + nanos / 1000000000; - } - public void readFields(DataInput in) throws IOException { in.readFully(internalBytes, 0, 4); if (TimestampWritable.hasDecimalOrSecondVInt(internalBytes[0])) { @@ -493,7 +481,7 @@ public static void convertTimestampToBytes(Timestamp t, byte[] b, long millis = t.getTime(); int nanos = t.getNanos(); - long seconds = millisToSeconds(millis); + long seconds = TimestampUtils.millisToSeconds(millis); boolean hasSecondVInt = seconds < 0 || seconds > Integer.MAX_VALUE; boolean hasDecimal = setNanosBytes(nanos, b, offset+4, hasSecondVInt); @@ -541,20 +529,6 @@ private static boolean setNanosBytes(int nanos, byte[] b, int offset, boolean ha return decimal != 0; } - public static Timestamp decimalToTimestamp(HiveDecimal d) { - BigDecimal nanoInstant = d.bigDecimalValue().multiply(BILLION_BIG_DECIMAL); - int nanos = nanoInstant.remainder(BILLION_BIG_DECIMAL).intValue(); - if (nanos < 0) { - nanos += 1000000000; - } - long seconds = - nanoInstant.subtract(new BigDecimal(nanos)).divide(BILLION_BIG_DECIMAL).longValue(); - Timestamp t = new Timestamp(seconds * 1000); - t.setNanos(nanos); - - return t; - } - public HiveDecimal getHiveDecimal() { if (timestampEmpty) { populateTimestamp(); @@ -565,11 +539,12 @@ public HiveDecimal getHiveDecimal() { public static HiveDecimal getHiveDecimal(Timestamp timestamp) { // The BigDecimal class recommends not converting directly from double to BigDecimal, // so we convert through a string... - Double timestampDouble = TimestampWritable.getDouble(timestamp); + Double timestampDouble = TimestampUtils.getDouble(timestamp); HiveDecimal result = HiveDecimal.create(timestampDouble.toString()); return result; } + /** * Converts the time in seconds or milliseconds to a timestamp. * @param time time in seconds or in milliseconds @@ -580,71 +555,6 @@ public static Timestamp longToTimestamp(long time, boolean intToTimestampInSecon return new Timestamp(intToTimestampInSeconds ? time * 1000 : time); } - /** - * Converts the time in seconds or milliseconds to a timestamp. - * @param time time in seconds or in milliseconds - * @return the timestamp - */ - public static void setTimestampFromLong(Timestamp timestamp, long time, - boolean intToTimestampInSeconds) { - // If the time is in seconds, converts it to milliseconds first. - timestamp.setTime(intToTimestampInSeconds ? time * 1000 : time); - } - - public static Timestamp doubleToTimestamp(double f) { - long seconds = (long) f; - - // We must ensure the exactness of the double's fractional portion. - // 0.6 as the fraction part will be converted to 0.59999... and - // significantly reduce the savings from binary serialization - BigDecimal bd = new BigDecimal(String.valueOf(f)); - bd = bd.subtract(new BigDecimal(seconds)).multiply(new BigDecimal(1000000000)); - int nanos = bd.intValue(); - - // Convert to millis - long millis = seconds * 1000; - if (nanos < 0) { - millis -= 1000; - nanos += 1000000000; - } - Timestamp t = new Timestamp(millis); - - // Set remaining fractional portion to nanos - t.setNanos(nanos); - return t; - } - - public static void setTimestampFromDouble(Timestamp timestamp, double f) { - // Otherwise, BigDecimal throws an exception. 
(Support vector operations that sometimes - // do work on double Not-a-Number NaN values). - if (Double.isNaN(f)) { - timestamp.setTime(0); - return; - } - // Algorithm used by TimestampWritable.doubleToTimestamp method. - // Allocates a BigDecimal object! - - long seconds = (long) f; - - // We must ensure the exactness of the double's fractional portion. - // 0.6 as the fraction part will be converted to 0.59999... and - // significantly reduce the savings from binary serialization - BigDecimal bd = new BigDecimal(String.valueOf(f)); - bd = bd.subtract(new BigDecimal(seconds)).multiply(new BigDecimal(1000000000)); - int nanos = bd.intValue(); - - // Convert to millis - long millis = seconds * 1000; - if (nanos < 0) { - millis -= 1000; - nanos += 1000000000; - } - timestamp.setTime(millis); - - // Set remaining fractional portion to nanos - timestamp.setNanos(nanos); - } - public static void setTimestamp(Timestamp t, byte[] bytes, int offset) { boolean hasDecimalOrSecondVInt = hasDecimalOrSecondVInt(bytes[offset]); long seconds = (long) TimestampWritable.getSeconds(bytes, offset); @@ -737,16 +647,4 @@ static long readSevenByteLong(byte[] bytes, int offset) { | ((0xFFL & bytes[offset+5]) << 16) | ((0xFFL & bytes[offset+6]) << 8)) >> 8; } - - /** - * Rounds the number of milliseconds relative to the epoch down to the nearest whole number of - * seconds. 500 would round to 0, -500 would round to -1. - */ - public static long millisToSeconds(long millis) { - if (millis >= 0) { - return millis / 1000; - } else { - return (millis - 999) / 1000; - } - } } diff --git a/serde/src/java/org/apache/hadoop/hive/serde2/objectinspector/primitive/PrimitiveObjectInspectorUtils.java b/serde/src/java/org/apache/hadoop/hive/serde2/objectinspector/primitive/PrimitiveObjectInspectorUtils.java index 932ae0baab99..6415bf80768a 100644 --- a/serde/src/java/org/apache/hadoop/hive/serde2/objectinspector/primitive/PrimitiveObjectInspectorUtils.java +++ b/serde/src/java/org/apache/hadoop/hive/serde2/objectinspector/primitive/PrimitiveObjectInspectorUtils.java @@ -27,6 +27,7 @@ import java.util.HashMap; import java.util.Map; +import org.apache.hadoop.hive.ql.util.TimestampUtils; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import org.apache.hadoop.hive.common.type.HiveChar; @@ -1088,13 +1089,13 @@ public static Timestamp getTimestamp(Object o, PrimitiveObjectInspector inputOI, result = TimestampWritable.longToTimestamp(longValue, intToTimestampInSeconds); break; case FLOAT: - result = TimestampWritable.doubleToTimestamp(((FloatObjectInspector) inputOI).get(o)); + result = TimestampUtils.doubleToTimestamp(((FloatObjectInspector) inputOI).get(o)); break; case DOUBLE: - result = TimestampWritable.doubleToTimestamp(((DoubleObjectInspector) inputOI).get(o)); + result = TimestampUtils.doubleToTimestamp(((DoubleObjectInspector) inputOI).get(o)); break; case DECIMAL: - result = TimestampWritable.decimalToTimestamp(((HiveDecimalObjectInspector) inputOI) + result = TimestampUtils.decimalToTimestamp(((HiveDecimalObjectInspector) inputOI) .getPrimitiveJavaObject(o)); break; case STRING: diff --git a/serde/src/test/org/apache/hadoop/hive/serde2/io/TestTimestampWritable.java b/serde/src/test/org/apache/hadoop/hive/serde2/io/TestTimestampWritable.java index 6c763bcafdb3..7619efad3459 100644 --- a/serde/src/test/org/apache/hadoop/hive/serde2/io/TestTimestampWritable.java +++ b/serde/src/test/org/apache/hadoop/hive/serde2/io/TestTimestampWritable.java @@ -35,6 +35,7 @@ import java.util.Random; import java.util.TimeZone; 
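
Between the conversion helpers removed from TimestampWritable above and the test updates that follow, the rounding rule for millisToSeconds is easy to lose track of: it rounds toward negative infinity, not toward zero. A small standalone sketch, mirroring the implementation removed from TimestampWritable in this patch; the local helper is a copy for illustration, not the patched TimestampUtils API itself.

    public class MillisToSecondsDemo {
      // Same rule as the removed TimestampWritable.millisToSeconds:
      // floor division, so 500 ms -> 0 s and -500 ms -> -1 s.
      static long millisToSeconds(long millis) {
        return millis >= 0 ? millis / 1000 : (millis - 999) / 1000;
      }

      public static void main(String[] args) {
        System.out.println(millisToSeconds(500));    // 0
        System.out.println(millisToSeconds(-500));   // -1
        System.out.println(millisToSeconds(-1000));  // -1
        System.out.println(millisToSeconds(-1001));  // -2
        System.out.println(millisToSeconds(19999));  // 19
      }
    }
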
+import org.apache.hadoop.hive.ql.util.TimestampUtils; import org.junit.*; import static org.junit.Assert.*; @@ -70,7 +71,7 @@ private static long getSeconds(Timestamp ts) { long seconds = (ts.getTime() - ts.getNanos() / 1000000) / 1000; // It should also be possible to calculate this based on ts.getTime() only. - assertEquals(seconds, TimestampWritable.millisToSeconds(ts.getTime())); + assertEquals(seconds, TimestampUtils.millisToSeconds(ts.getTime())); return seconds; } @@ -335,10 +336,10 @@ public void testToFromDouble() { Math.pow(10, 9 - nanosPrecision)); assertEquals(String.format("Invalid nanosecond part recovered from %f", asDouble), nanos, recoveredNanos); - assertEquals(ts, TimestampWritable.doubleToTimestamp(asDouble)); + assertEquals(ts, TimestampUtils.doubleToTimestamp(asDouble)); // decimalToTimestamp should be consistent with doubleToTimestamp for this level of // precision. - assertEquals(ts, TimestampWritable.decimalToTimestamp( + assertEquals(ts, TimestampUtils.decimalToTimestamp( HiveDecimal.create(BigDecimal.valueOf(asDouble)))); } } @@ -358,7 +359,7 @@ public void testDecimalToTimestampRandomly() { Timestamp ts = new Timestamp( randomMillis(MIN_FOUR_DIGIT_YEAR_MILLIS, MAX_FOUR_DIGIT_YEAR_MILLIS, rand)); ts.setNanos(randomNanos(rand, 9)); // full precision - assertEquals(ts, TimestampWritable.decimalToTimestamp(timestampToDecimal(ts))); + assertEquals(ts, TimestampUtils.decimalToTimestamp(timestampToDecimal(ts))); } } @@ -371,8 +372,8 @@ public void testDecimalToTimestampCornerCases() { for (int nanos : new int[] { 100000, 900000, 999100000, 999900000 }) { ts.setNanos(nanos); HiveDecimal d = timestampToDecimal(ts); - assertEquals(ts, TimestampWritable.decimalToTimestamp(d)); - assertEquals(ts, TimestampWritable.doubleToTimestamp(d.bigDecimalValue().doubleValue())); + assertEquals(ts, TimestampUtils.decimalToTimestamp(d)); + assertEquals(ts, TimestampUtils.doubleToTimestamp(d.bigDecimalValue().doubleValue())); } } @@ -435,20 +436,20 @@ public void testMaxSize() { @Concurrent(count=4) @Repeating(repetition=100) public void testMillisToSeconds() { - assertEquals(0, TimestampWritable.millisToSeconds(0)); - assertEquals(-1, TimestampWritable.millisToSeconds(-1)); - assertEquals(-1, TimestampWritable.millisToSeconds(-999)); - assertEquals(-1, TimestampWritable.millisToSeconds(-1000)); - assertEquals(-2, TimestampWritable.millisToSeconds(-1001)); - assertEquals(-2, TimestampWritable.millisToSeconds(-1999)); - assertEquals(-2, TimestampWritable.millisToSeconds(-2000)); - assertEquals(-3, TimestampWritable.millisToSeconds(-2001)); - assertEquals(-99, TimestampWritable.millisToSeconds(-99000)); - assertEquals(-100, TimestampWritable.millisToSeconds(-99001)); - assertEquals(-100, TimestampWritable.millisToSeconds(-100000)); - assertEquals(1, TimestampWritable.millisToSeconds(1500)); - assertEquals(19, TimestampWritable.millisToSeconds(19999)); - assertEquals(20, TimestampWritable.millisToSeconds(20000)); + assertEquals(0, TimestampUtils.millisToSeconds(0)); + assertEquals(-1, TimestampUtils.millisToSeconds(-1)); + assertEquals(-1, TimestampUtils.millisToSeconds(-999)); + assertEquals(-1, TimestampUtils.millisToSeconds(-1000)); + assertEquals(-2, TimestampUtils.millisToSeconds(-1001)); + assertEquals(-2, TimestampUtils .millisToSeconds(-1999)); + assertEquals(-2, TimestampUtils .millisToSeconds(-2000)); + assertEquals(-3, TimestampUtils .millisToSeconds(-2001)); + assertEquals(-99, TimestampUtils .millisToSeconds(-99000)); + assertEquals(-100, TimestampUtils 
.millisToSeconds(-99001)); + assertEquals(-100, TimestampUtils .millisToSeconds(-100000)); + assertEquals(1, TimestampUtils .millisToSeconds(1500)); + assertEquals(19, TimestampUtils .millisToSeconds(19999)); + assertEquals(20, TimestampUtils .millisToSeconds(20000)); } private static int compareEqualLengthByteArrays(byte[] a, byte[] b) { diff --git a/shims/0.23/src/main/java/org/apache/hadoop/hive/shims/Hadoop23Shims.java b/shims/0.23/src/main/java/org/apache/hadoop/hive/shims/Hadoop23Shims.java index 2b7c747f8b89..ef2b7f79632e 100644 --- a/shims/0.23/src/main/java/org/apache/hadoop/hive/shims/Hadoop23Shims.java +++ b/shims/0.23/src/main/java/org/apache/hadoop/hive/shims/Hadoop23Shims.java @@ -102,42 +102,18 @@ public class Hadoop23Shims extends HadoopShimsSecure { HadoopShims.MiniDFSShim cluster = null; - final boolean zeroCopy; final boolean storagePolicy; - final boolean fastread; public Hadoop23Shims() { - boolean zcr = false; + // in-memory HDFS boolean storage = false; - boolean fastread = false; try { - Class.forName("org.apache.hadoop.fs.CacheFlag", false, - ShimLoader.class.getClassLoader()); - zcr = true; - } catch (ClassNotFoundException ce) { - } - - if (zcr) { - // in-memory HDFS is only available after zcr - try { - Class.forName("org.apache.hadoop.hdfs.protocol.BlockStoragePolicy", + Class.forName("org.apache.hadoop.hdfs.protocol.BlockStoragePolicy", false, ShimLoader.class.getClassLoader()); - storage = true; - } catch (ClassNotFoundException ce) { - } - } - - if (storage) { - for (Method m : Text.class.getMethods()) { - if ("readWithKnownLength".equals(m.getName())) { - fastread = true; - } - } + storage = true; + } catch (ClassNotFoundException ce) { } - this.storagePolicy = storage; - this.zeroCopy = zcr; - this.fastread = fastread; } @Override @@ -853,15 +829,6 @@ public FileSystem createProxyFileSystem(FileSystem fs, URI uri) { return new ProxyFileSystem23(fs, uri); } - @Override - public ZeroCopyReaderShim getZeroCopyReader(FSDataInputStream in, ByteBufferPoolShim pool) throws IOException { - if(zeroCopy) { - return ZeroCopyShims.getZeroCopyReader(in, pool); - } - /* not supported */ - return null; - } - @Override public Configuration getConfiguration(org.apache.hadoop.mapreduce.JobContext context) { return context.getConfiguration(); @@ -1302,26 +1269,4 @@ public void addDelegationTokens(FileSystem fs, Credentials cred, String uname) t public long getFileId(FileSystem fs, String path) throws IOException { return ensureDfs(fs).getClient().getFileInfo(path).getFileId(); } - - private final class FastTextReaderShim implements TextReaderShim { - private final DataInputStream din; - - public FastTextReaderShim(InputStream in) { - this.din = new DataInputStream(in); - } - - @Override - public void read(Text txt, int len) throws IOException { - txt.readWithKnownLength(din, len); - } - } - - @Override - public TextReaderShim getTextReaderShim(InputStream in) throws IOException { - if (!fastread) { - return super.getTextReaderShim(in); - } - return new FastTextReaderShim(in); - } - } diff --git a/shims/common/src/main/java/org/apache/hadoop/hive/shims/HadoopShims.java b/shims/common/src/main/java/org/apache/hadoop/hive/shims/HadoopShims.java index 37eb8f66360a..4a96355a93a1 100644 --- a/shims/common/src/main/java/org/apache/hadoop/hive/shims/HadoopShims.java +++ b/shims/common/src/main/java/org/apache/hadoop/hive/shims/HadoopShims.java @@ -402,57 +402,6 @@ public interface StoragePolicyShim { */ public StoragePolicyShim getStoragePolicyShim(FileSystem fs); - /** - * a 
hadoop.io ByteBufferPool shim. - */ - public interface ByteBufferPoolShim { - /** - * Get a new ByteBuffer from the pool. The pool can provide this from - * removing a buffer from its internal cache, or by allocating a - * new buffer. - * - * @param direct Whether the buffer should be direct. - * @param length The minimum length the buffer will have. - * @return A new ByteBuffer. Its capacity can be less - * than what was requested, but must be at - * least 1 byte. - */ - ByteBuffer getBuffer(boolean direct, int length); - - /** - * Release a buffer back to the pool. - * The pool may choose to put this buffer into its cache/free it. - * - * @param buffer a direct bytebuffer - */ - void putBuffer(ByteBuffer buffer); - } - - /** - * Provides an HDFS ZeroCopyReader shim. - * @param in FSDataInputStream to read from (where the cached/mmap buffers are tied to) - * @param in ByteBufferPoolShim to allocate fallback buffers with - * - * @return returns null if not supported - */ - public ZeroCopyReaderShim getZeroCopyReader(FSDataInputStream in, ByteBufferPoolShim pool) throws IOException; - - public interface ZeroCopyReaderShim { - /** - * Get a ByteBuffer from the FSDataInputStream - this can be either a HeapByteBuffer or an MappedByteBuffer. - * Also move the in stream by that amount. The data read can be small than maxLength. - * - * @return ByteBuffer read from the stream, - */ - public ByteBuffer readBuffer(int maxLength, boolean verifyChecksums) throws IOException; - /** - * Release a ByteBuffer obtained from a read on the - * Also move the in stream by that amount. The data read can be small than maxLength. - * - */ - public void releaseBuffer(ByteBuffer buffer); - } - /** * Get configuration from JobContext */ @@ -692,23 +641,4 @@ public List getKeys() throws IOException{ */ long getFileId(FileSystem fs, String path) throws IOException; - /** - * Read data into a Text object in the fastest way possible - */ - public interface TextReaderShim { - /** - * @param txt - * @param len - * @return bytes read - * @throws IOException - */ - void read(Text txt, int size) throws IOException; - } - - /** - * Wrap a TextReaderShim around an input stream. The reader shim will not - * buffer any reads from the underlying stream and will only consume bytes - * which are required for TextReaderShim.read() input. 
- */ - public TextReaderShim getTextReaderShim(InputStream input) throws IOException; } diff --git a/shims/common/src/main/java/org/apache/hadoop/hive/shims/HadoopShimsSecure.java b/shims/common/src/main/java/org/apache/hadoop/hive/shims/HadoopShimsSecure.java index 87682e6e5300..224ce3b0366e 100644 --- a/shims/common/src/main/java/org/apache/hadoop/hive/shims/HadoopShimsSecure.java +++ b/shims/common/src/main/java/org/apache/hadoop/hive/shims/HadoopShimsSecure.java @@ -392,33 +392,4 @@ public void checkFileAccess(FileSystem fs, FileStatus stat, FsAction action) @Override abstract public void addDelegationTokens(FileSystem fs, Credentials cred, String uname) throws IOException; - - private final class BasicTextReaderShim implements TextReaderShim { - private final InputStream in; - - public BasicTextReaderShim(InputStream in) { - this.in = in; - } - - @Override - public void read(Text txt, int len) throws IOException { - int offset = 0; - byte[] bytes = new byte[len]; - while (len > 0) { - int written = in.read(bytes, offset, len); - if (written < 0) { - throw new EOFException("Can't finish read from " + in + " read " - + (offset) + " bytes out of " + bytes.length); - } - len -= written; - offset += written; - } - txt.set(bytes); - } - } - - @Override - public TextReaderShim getTextReaderShim(InputStream in) throws IOException { - return new BasicTextReaderShim(in); - } } diff --git a/storage-api/src/java/org/apache/hadoop/hive/ql/exec/vector/TimestampColumnVector.java b/storage-api/src/java/org/apache/hadoop/hive/ql/exec/vector/TimestampColumnVector.java index d9713396e47e..228461af1445 100644 --- a/storage-api/src/java/org/apache/hadoop/hive/ql/exec/vector/TimestampColumnVector.java +++ b/storage-api/src/java/org/apache/hadoop/hive/ql/exec/vector/TimestampColumnVector.java @@ -281,8 +281,13 @@ public void flatten(boolean selectedInUse, int[] sel, int size) { * @param timestamp */ public void set(int elementNum, Timestamp timestamp) { - this.time[elementNum] = timestamp.getTime(); - this.nanos[elementNum] = timestamp.getNanos(); + if (timestamp == null) { + this.noNulls = false; + this.isNull[elementNum] = true; + } else { + this.time[elementNum] = timestamp.getTime(); + this.nanos[elementNum] = timestamp.getNanos(); + } } /** diff --git a/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/expressions/StringExpr.java b/storage-api/src/java/org/apache/hadoop/hive/ql/exec/vector/expressions/StringExpr.java similarity index 100% rename from ql/src/java/org/apache/hadoop/hive/ql/exec/vector/expressions/StringExpr.java rename to storage-api/src/java/org/apache/hadoop/hive/ql/exec/vector/expressions/StringExpr.java diff --git a/storage-api/src/java/org/apache/hadoop/hive/ql/io/sarg/SearchArgumentImpl.java b/storage-api/src/java/org/apache/hadoop/hive/ql/io/sarg/SearchArgumentImpl.java index 8c5bab25df31..10d8c51f86ba 100644 --- a/storage-api/src/java/org/apache/hadoop/hive/ql/io/sarg/SearchArgumentImpl.java +++ b/storage-api/src/java/org/apache/hadoop/hive/ql/io/sarg/SearchArgumentImpl.java @@ -32,11 +32,11 @@ import java.util.Set; /** - * The implementation of SearchArguments. + * The implementation of SearchArguments. Visible for testing only. 
 */ -final class SearchArgumentImpl implements SearchArgument { +public final class SearchArgumentImpl implements SearchArgument { - static final class PredicateLeafImpl implements PredicateLeaf { + public static final class PredicateLeafImpl implements PredicateLeaf { private final Operator operator; private final Type type; private String columnName; @@ -53,11 +53,11 @@ static final class PredicateLeafImpl implements PredicateLeaf { literalList = null; } - PredicateLeafImpl(Operator operator, - Type type, - String columnName, - Object literal, - List<Object> literalList) { + public PredicateLeafImpl(Operator operator, + Type type, + String columnName, + Object literal, + List<Object> literalList) { this.operator = operator; this.type = type; this.columnName = columnName; diff --git a/storage-api/src/java/org/apache/hadoop/hive/ql/util/TimestampUtils.java b/storage-api/src/java/org/apache/hadoop/hive/ql/util/TimestampUtils.java new file mode 100644 index 000000000000..189ead565034 --- /dev/null +++ b/storage-api/src/java/org/apache/hadoop/hive/ql/util/TimestampUtils.java @@ -0,0 +1,94 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.hive.ql.util; + +import org.apache.hadoop.hive.common.type.HiveDecimal; + +import java.math.BigDecimal; +import java.sql.Timestamp; + +/** + * Utilities for Timestamps and the relevant conversions. + */ +public class TimestampUtils { + public static final BigDecimal BILLION_BIG_DECIMAL = BigDecimal.valueOf(1000000000); + + /** + * Convert the timestamp to a double measured in seconds. + * @return double representation of the timestamp, accurate to nanoseconds + */ + public static double getDouble(Timestamp ts) { + long seconds = millisToSeconds(ts.getTime()); + return seconds + ((double) ts.getNanos()) / 1000000000; + } + + public static Timestamp doubleToTimestamp(double f) { + long seconds = (long) f; + + // We must ensure the exactness of the double's fractional portion. + // 0.6 as the fraction part will be converted to 0.59999...
and + // significantly reduce the savings from binary serialization + BigDecimal bd; + try { + bd = new BigDecimal(String.valueOf(f)); + } catch (NumberFormatException nfe) { + return null; + } + bd = bd.subtract(new BigDecimal(seconds)).multiply(new BigDecimal(1000000000)); + int nanos = bd.intValue(); + + // Convert to millis + long millis = seconds * 1000; + if (nanos < 0) { + millis -= 1000; + nanos += 1000000000; + } + Timestamp t = new Timestamp(millis); + + // Set remaining fractional portion to nanos + t.setNanos(nanos); + return t; + } + + public static Timestamp decimalToTimestamp(HiveDecimal d) { + BigDecimal nanoInstant = d.bigDecimalValue().multiply(BILLION_BIG_DECIMAL); + int nanos = nanoInstant.remainder(BILLION_BIG_DECIMAL).intValue(); + if (nanos < 0) { + nanos += 1000000000; + } + long seconds = + nanoInstant.subtract(new BigDecimal(nanos)).divide(BILLION_BIG_DECIMAL).longValue(); + Timestamp t = new Timestamp(seconds * 1000); + t.setNanos(nanos); + + return t; + } + + /** + * Rounds the number of milliseconds relative to the epoch down to the nearest whole number of + * seconds. 500 would round to 0, -500 would round to -1. + */ + public static long millisToSeconds(long millis) { + if (millis >= 0) { + return millis / 1000; + } else { + return (millis - 999) / 1000; + } + } +}
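
The helpers above complete the move of the timestamp conversion logic into the new storage-api class org.apache.hadoop.hive.ql.util.TimestampUtils. The sketch below is not part of the patch; the demo class name and sample values are invented, and it only illustrates how the relocated methods, as added above, are expected to behave.

import java.sql.Timestamp;

import org.apache.hadoop.hive.common.type.HiveDecimal;
import org.apache.hadoop.hive.ql.util.TimestampUtils;

/** Hypothetical demo class, not part of this patch. */
public class TimestampUtilsDemo {
  public static void main(String[] args) {
    // millisToSeconds() rounds toward negative infinity, as its javadoc says:
    // 500 ms -> 0 s, -500 ms -> -1 s, -1000 ms -> -1 s.
    System.out.println(TimestampUtils.millisToSeconds(500));    // 0
    System.out.println(TimestampUtils.millisToSeconds(-500));   // -1
    System.out.println(TimestampUtils.millisToSeconds(-1000));  // -1

    // getDouble()/doubleToTimestamp() convert between a Timestamp and seconds held in a
    // double. A double carries roughly 15-16 significant digits, so for present-day epoch
    // values only a few fractional digits of the nanosecond field survive the round trip,
    // which is why TestTimestampWritable.testToFromDouble works at reduced nanosecond
    // precision.
    Timestamp ts = Timestamp.valueOf("2016-03-25 19:39:12.123");
    double asDouble = TimestampUtils.getDouble(ts);
    System.out.println(TimestampUtils.doubleToTimestamp(asDouble));

    // decimalToTimestamp() is exact: an exact decimal number of seconds comes back with
    // full nanosecond precision.
    Timestamp fromDecimal =
        TimestampUtils.decimalToTimestamp(HiveDecimal.create("0.123456789"));
    System.out.println(fromDecimal.getNanos());  // 123456789
  }
}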
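
The TimestampColumnVector.set() change earlier in this patch now treats a null Timestamp as a null row instead of dereferencing the argument. A minimal sketch of that behaviour follows; the class name is invented, and the batch size of 1024 and the TimestampColumnVector(int) constructor are assumptions for illustration.

import java.sql.Timestamp;

import org.apache.hadoop.hive.ql.exec.vector.TimestampColumnVector;

/** Hypothetical snippet, not part of this patch. */
public class TimestampColumnVectorNullDemo {
  public static void main(String[] args) {
    TimestampColumnVector col = new TimestampColumnVector(1024);

    // A non-null value is stored in the time/nanos arrays as before.
    col.set(0, Timestamp.valueOf("2016-03-25 19:39:12"));

    // With this patch, a null value marks the row as null rather than throwing a
    // NullPointerException from timestamp.getTime().
    col.set(1, null);

    System.out.println(col.noNulls);    // false
    System.out.println(col.isNull[1]);  // true
  }
}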