From ecf4d3ebfdab99f71daf5828f911b857a2d4c15c Mon Sep 17 00:00:00 2001 From: Owen O'Malley Date: Mon, 16 Jan 2017 15:46:55 -0800 Subject: [PATCH 01/21] Preparing for a 1.3 release. Signed-off-by: Owen O'Malley --- CMakeLists.txt | 2 +- java/core/pom.xml | 2 +- java/mapreduce/pom.xml | 2 +- java/pom.xml | 8 ++++---- java/tools/pom.xml | 2 +- 5 files changed, 8 insertions(+), 8 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index ecf7fe13e9..2fee92ab7e 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -17,7 +17,7 @@ project(ORC) # Version number of package SET(CPACK_PACKAGE_VERSION_MAJOR "1") SET(CPACK_PACKAGE_VERSION_MINOR "3") -SET(CPACK_PACKAGE_VERSION_PATCH "0-SNAPSHOT") +SET(CPACK_PACKAGE_VERSION_PATCH "0") SET(ORC_VERSION "${CPACK_PACKAGE_VERSION_MAJOR}.${CPACK_PACKAGE_VERSION_MINOR}.${CPACK_PACKAGE_VERSION_PATCH}") option (BUILD_JAVA diff --git a/java/core/pom.xml b/java/core/pom.xml index 0cf79f5f8a..93b2a8bf73 100644 --- a/java/core/pom.xml +++ b/java/core/pom.xml @@ -19,7 +19,7 @@ <groupId>org.apache.orc</groupId> <artifactId>orc</artifactId> - <version>1.3.0-SNAPSHOT</version> + <version>1.3.0</version> <relativePath>../pom.xml</relativePath> diff --git a/java/mapreduce/pom.xml b/java/mapreduce/pom.xml index 93553b1be3..15ba425d32 100644 --- a/java/mapreduce/pom.xml +++ b/java/mapreduce/pom.xml @@ -19,7 +19,7 @@ <groupId>org.apache.orc</groupId> <artifactId>orc</artifactId> - <version>1.3.0-SNAPSHOT</version> + <version>1.3.0</version> <relativePath>../pom.xml</relativePath> diff --git a/java/pom.xml b/java/pom.xml index 0d9dd303f9..6f0246e3ad 100644 --- a/java/pom.xml +++ b/java/pom.xml @@ -21,7 +21,7 @@ <groupId>org.apache.orc</groupId> <artifactId>orc</artifactId> - <version>1.3.0-SNAPSHOT</version> + <version>1.3.0</version> <packaging>pom</packaging> <name>Apache ORC</name> @@ -234,17 +234,17 @@ <groupId>org.apache.orc</groupId> <artifactId>orc-core</artifactId> - <version>1.3.0-SNAPSHOT</version> + <version>1.3.0</version> <groupId>org.apache.orc</groupId> <artifactId>orc-mapreduce</artifactId> - <version>1.3.0-SNAPSHOT</version> + <version>1.3.0</version> <groupId>org.apache.orc</groupId> <artifactId>orc-tools</artifactId> - <version>1.3.0-SNAPSHOT</version> + <version>1.3.0</version> diff --git a/java/tools/pom.xml b/java/tools/pom.xml index e8a0c0ac7d..9b49682083 100644 --- a/java/tools/pom.xml +++ b/java/tools/pom.xml @@ -19,7 +19,7 @@ <groupId>org.apache.orc</groupId> <artifactId>orc</artifactId> - <version>1.3.0-SNAPSHOT</version> + <version>1.3.0</version> <relativePath>../pom.xml</relativePath> From 82132e83ae256acbaa1a7e792abe38c50f5e941c Mon Sep 17 00:00:00 2001 From: Owen O'Malley Date: Tue, 17 Jan 2017 14:04:04 -0800 Subject: [PATCH 02/21] ORC-131. Fix getRawDataSize() in WriterImpl.
(omalley) Fixes #83 Signed-off-by: Owen O'Malley --- java/core/src/java/org/apache/orc/impl/WriterImpl.java | 1 + java/tools/src/test/org/apache/orc/tools/TestFileDump.java | 4 ++++ 2 files changed, 5 insertions(+) diff --git a/java/core/src/java/org/apache/orc/impl/WriterImpl.java b/java/core/src/java/org/apache/orc/impl/WriterImpl.java index c364ca0027..d9140e806c 100644 --- a/java/core/src/java/org/apache/orc/impl/WriterImpl.java +++ b/java/core/src/java/org/apache/orc/impl/WriterImpl.java @@ -2748,6 +2748,7 @@ private long writeFooter() throws IOException { OrcProto.Footer.Builder builder = OrcProto.Footer.newBuilder(); builder.setNumberOfRows(rowCount); builder.setRowIndexStride(rowIndexStride); + rawDataSize = computeRawDataSize(); // serialize the types writeTypes(builder, schema); // add the stripe information diff --git a/java/tools/src/test/org/apache/orc/tools/TestFileDump.java b/java/tools/src/test/org/apache/orc/tools/TestFileDump.java index 65ff404ec3..1556ab4805 100644 --- a/java/tools/src/test/org/apache/orc/tools/TestFileDump.java +++ b/java/tools/src/test/org/apache/orc/tools/TestFileDump.java @@ -238,6 +238,8 @@ public void testDump() throws Exception { writer.addRowBatch(batch); } writer.close(); + assertEquals(2079000, writer.getRawDataSize()); + assertEquals(21000, writer.getNumberOfRows()); PrintStream origOut = System.out; String outputFilename = "orc-file-dump.out"; FileOutputStream myOut = new FileOutputStream(workDir + File.separator + outputFilename); @@ -306,6 +308,8 @@ public void testDataDump() throws Exception { writer.addRowBatch(batch); writer.close(); + assertEquals(1564, writer.getRawDataSize()); + assertEquals(2, writer.getNumberOfRows()); PrintStream origOut = System.out; ByteArrayOutputStream myOut = new ByteArrayOutputStream(); From b9cc166b3396ca0d95b811a11ac2037fed2b1059 Mon Sep 17 00:00:00 2001 From: Owen O'Malley Date: Wed, 18 Jan 2017 16:55:17 -0800 Subject: [PATCH 03/21] ORC-132. Implement a merge file method and fix the number of rows written during merge. (omalley) Fixes #84 Signed-off-by: Owen O'Malley --- .../core/src/java/org/apache/orc/OrcFile.java | 230 +++++++++++++++++- java/core/src/java/org/apache/orc/Writer.java | 3 +- .../org/apache/orc/impl/PhysicalFsWriter.java | 3 - .../java/org/apache/orc/impl/ReaderImpl.java | 16 +- .../java/org/apache/orc/impl/WriterImpl.java | 20 +- .../org/apache/orc/TestVectorOrcFile.java | 170 +++++++++++++ 6 files changed, 428 insertions(+), 14 deletions(-) diff --git a/java/core/src/java/org/apache/orc/OrcFile.java b/java/core/src/java/org/apache/orc/OrcFile.java index 68e49f3d2c..cfabba9b87 100644 --- a/java/core/src/java/org/apache/orc/OrcFile.java +++ b/java/core/src/java/org/apache/orc/OrcFile.java @@ -19,20 +19,29 @@ package org.apache.orc; import java.io.IOException; +import java.nio.ByteBuffer; +import java.util.ArrayList; +import java.util.HashMap; +import java.util.List; +import java.util.Map; import java.util.Properties; import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.FSDataInputStream; import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.Path; import org.apache.orc.impl.MemoryManager; import org.apache.orc.impl.OrcTail; import org.apache.orc.impl.ReaderImpl; import org.apache.orc.impl.WriterImpl; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; /** * Contains factory methods to read or write ORC files.
*/ public class OrcFile { + private static final Logger LOG = LoggerFactory.getLogger(OrcFile.class); public static final String MAGIC = "ORC"; /** @@ -52,7 +61,8 @@ public class OrcFile { */ public enum Version { V_0_11("0.11", 0, 11), - V_0_12("0.12", 0, 12); + V_0_12("0.12", 0, 12), + FUTURE("future", Integer.MAX_VALUE, Integer.MAX_VALUE); public static final Version CURRENT = V_0_12; @@ -248,7 +258,7 @@ public interface WriterCallback { void preFooterWrite(WriterContext context) throws IOException; } - public static enum BloomFilterVersion { + public enum BloomFilterVersion { // Include both the BLOOM_FILTER and BLOOM_FILTER_UTF8 streams to support // both old and new readers. ORIGINAL("original"), @@ -257,7 +267,7 @@ public static enum BloomFilterVersion { UTF8("utf8"); private final String id; - private BloomFilterVersion(String id) { + BloomFilterVersion(String id) { this.id = id; } @@ -299,6 +309,7 @@ public static class WriterOptions { private double bloomFilterFpp; private BloomFilterVersion bloomFilterVersion; private PhysicalWriter physicalWriter; + private WriterVersion writerVersion = CURRENT_WRITER; protected WriterOptions(Properties tableProperties, Configuration conf) { configuration = conf; @@ -508,6 +519,20 @@ protected WriterOptions memory(MemoryManager value) { return this; } + /** + * Manually set the writer version. + * This is an internal API. + * @param version the version to write + * @return this + */ + protected WriterOptions writerVersion(WriterVersion version) { + if (version == WriterVersion.FUTURE) { + throw new IllegalArgumentException("Can't write a future version."); + } + this.writerVersion = version; + return this; + } + public boolean getBlockPadding() { return blockPaddingValue; } @@ -587,6 +612,10 @@ public BloomFilterVersion getBloomFilterVersion() { public PhysicalWriter getPhysicalWriter() { return physicalWriter; } + + public WriterVersion getWriterVersion() { + return writerVersion; + } } /** @@ -642,4 +671,199 @@ public static Writer createWriter(Path path, return new WriterImpl(fs, path, opts); } + /** + * Do we understand the version in the reader? + * @param path the path of the file + * @param reader the ORC file reader + * @return is the version understood by this writer? + */ + static boolean understandFormat(Path path, Reader reader) { + if (reader.getFileVersion() == Version.FUTURE) { + LOG.info("Can't merge {} because it has a future version.", path); + return false; + } + if (reader.getWriterVersion() == WriterVersion.FUTURE) { + LOG.info("Can't merge {} because it has a future writerVersion.", path); + return false; + } + return true; + } + + /** + * Is the new reader compatible with the file that is being written? + * @param schema the writer schema + * @param fileVersion the writer fileVersion + * @param writerVersion the writer writerVersion + * @param rowIndexStride the row index stride + * @param compression the compression that was used + * @param userMetadata the user metadata + * @param path the new path name for warning messages + * @param reader the new reader + * @return is the reader compatible with the previous ones? 
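+ * (an incompatibility is logged at INFO level and reported by returning false)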
+ */ + static boolean readerIsCompatible(TypeDescription schema, + Version fileVersion, + WriterVersion writerVersion, + int rowIndexStride, + CompressionKind compression, + Map<String, ByteBuffer> userMetadata, + Path path, + Reader reader) { + // now we have to check compatibility + if (!reader.getSchema().equals(schema)) { + LOG.info("Can't merge {} because of different schemas {} vs {}", + path, reader.getSchema(), schema); + return false; + } + if (reader.getCompressionKind() != compression) { + LOG.info("Can't merge {} because of different compression {} vs {}", + path, reader.getCompressionKind(), compression); + return false; + } + if (reader.getFileVersion() != fileVersion) { + LOG.info("Can't merge {} because of different file versions {} vs {}", + path, reader.getFileVersion(), fileVersion); + return false; + } + if (reader.getWriterVersion() != writerVersion) { + LOG.info("Can't merge {} because of different writer versions {} vs {}", + path, reader.getWriterVersion(), writerVersion); + return false; + } + if (reader.getRowIndexStride() != rowIndexStride) { + LOG.info("Can't merge {} because of different row index strides {} vs {}", + path, reader.getRowIndexStride(), rowIndexStride); + return false; + } + for(String key: reader.getMetadataKeys()) { + if (userMetadata.containsKey(key)) { + ByteBuffer currentValue = userMetadata.get(key); + ByteBuffer newValue = reader.getMetadataValue(key); + if (!newValue.equals(currentValue)) { + LOG.info("Can't merge {} because of different user metadata {}", path, + key); + return false; + } + } + } + return true; + } + + static void mergeMetadata(Map<String, ByteBuffer> metadata, + Reader reader) { + for(String key: reader.getMetadataKeys()) { + metadata.put(key, reader.getMetadataValue(key)); + } + } + + /** + * Merges multiple ORC files that all have the same schema to produce + * a single ORC file. + * The merge will reject files that aren't compatible with the merged file, + * so the output list may be shorter than the input list. + * The stripes are copied as serialized byte buffers. + * The user metadata are merged, and files that disagree on the value + * associated with a key will be rejected.
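+ * <p>A minimal usage sketch (the output and input paths here are hypothetical): + * <pre>{@code + * List<Path> merged = OrcFile.mergeFiles(new Path("/tmp/merged.orc"), + * OrcFile.writerOptions(conf), Arrays.asList(in1, in2)); + * }</pre>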
+ * + * @param outputPath the output file + * @param options the options for writing with although the options related + * to the input files' encodings are overridden + * @param inputFiles the list of files to merge + * @return the list of files that were successfully merged + * @throws IOException + */ + public static List<Path> mergeFiles(Path outputPath, + WriterOptions options, + List<Path> inputFiles) throws IOException { + Writer output = null; + final Configuration conf = options.getConfiguration(); + try { + byte[] buffer = new byte[0]; + TypeDescription schema = null; + CompressionKind compression = null; + int bufferSize = 0; + Version fileVersion = null; + WriterVersion writerVersion = null; + int rowIndexStride = 0; + List<Path> result = new ArrayList<>(inputFiles.size()); + Map<String, ByteBuffer> userMetadata = new HashMap<>(); + + for (Path input : inputFiles) { + FileSystem fs = input.getFileSystem(conf); + Reader reader = createReader(input, + readerOptions(options.getConfiguration()).filesystem(fs)); + + if (!understandFormat(input, reader)) { + continue; + } else if (schema == null) { + // if this is the first file that we are including, grab the values + schema = reader.getSchema(); + compression = reader.getCompressionKind(); + bufferSize = reader.getCompressionSize(); + rowIndexStride = reader.getRowIndexStride(); + fileVersion = reader.getFileVersion(); + writerVersion = reader.getWriterVersion(); + options.blockSize(bufferSize) + .version(fileVersion) + .writerVersion(writerVersion) + .compress(compression) + .rowIndexStride(rowIndexStride) + .setSchema(schema); + if (compression != CompressionKind.NONE) { + options.enforceBufferSize().bufferSize(bufferSize); + } + mergeMetadata(userMetadata, reader); + output = createWriter(outputPath, options); + } else if (!readerIsCompatible(schema, fileVersion, writerVersion, + rowIndexStride, compression, userMetadata, input, reader)) { + continue; + } else { + mergeMetadata(userMetadata, reader); + if (bufferSize < reader.getCompressionSize()) { + bufferSize = reader.getCompressionSize(); + ((WriterImpl) output).increaseCompressionSize(bufferSize); + } + } + List<OrcProto.StripeStatistics> statList = + reader.getOrcProtoStripeStatistics(); + try (FSDataInputStream inputStream = fs.open(input)) { + int stripeNum = 0; + result.add(input); + + for (StripeInformation stripe : reader.getStripes()) { + int length = (int) stripe.getLength(); + if (buffer.length < length) { + buffer = new byte[length]; + } + long offset = stripe.getOffset(); + inputStream.readFully(offset, buffer, 0, length); + output.appendStripe(buffer, 0, length, stripe, statList.get(stripeNum++)); + } + } + } + if (output != null) { + for (Map.Entry<String, ByteBuffer> entry : userMetadata.entrySet()) { + output.addUserMetadata(entry.getKey(), entry.getValue()); + } + output.close(); + } + return result; + } catch (IOException ioe) { + if (output != null) { + try { + output.close(); + } catch (Throwable t) { + // PASS + } + try { + FileSystem fs = options.getFileSystem() == null ?
+ outputPath.getFileSystem(conf) : options.getFileSystem(); + fs.delete(outputPath, false); + } catch (Throwable t) { + // PASS + } + } + throw ioe; + } + } } diff --git a/java/core/src/java/org/apache/orc/Writer.java b/java/core/src/java/org/apache/orc/Writer.java index 596e14ed4f..b496594b91 100644 --- a/java/core/src/java/org/apache/orc/Writer.java +++ b/java/core/src/java/org/apache/orc/Writer.java @@ -20,6 +20,7 @@ import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch; +import java.io.Closeable; import java.io.IOException; import java.nio.ByteBuffer; import java.util.List; @@ -31,7 +32,7 @@ /** * The interface for writing ORC files. */ -public interface Writer { +public interface Writer extends Closeable { /** * Get the schema for this writer diff --git a/java/core/src/java/org/apache/orc/impl/PhysicalFsWriter.java b/java/core/src/java/org/apache/orc/impl/PhysicalFsWriter.java index 48a0b4267c..17c73ffe23 100644 --- a/java/core/src/java/org/apache/orc/impl/PhysicalFsWriter.java +++ b/java/core/src/java/org/apache/orc/impl/PhysicalFsWriter.java @@ -204,9 +204,6 @@ public void writeFileFooter(OrcProto.Footer.Builder builder) throws IOException public long writePostScript(OrcProto.PostScript.Builder builder) throws IOException { builder.setFooterLength(footerLength); builder.setMetadataLength(metadataLength); - if (compress != CompressionKind.NONE) { - builder.setCompressionBlockSize(bufferSize); - } OrcProto.PostScript ps = builder.build(); // need to write this uncompressed long startPosn = rawWriter.getPos(); diff --git a/java/core/src/java/org/apache/orc/impl/ReaderImpl.java b/java/core/src/java/org/apache/orc/impl/ReaderImpl.java index c24920d62b..9c8c06bbc0 100644 --- a/java/core/src/java/org/apache/orc/impl/ReaderImpl.java +++ b/java/core/src/java/org/apache/orc/impl/ReaderImpl.java @@ -187,16 +187,22 @@ public List getTypes() { return types; } - @Override - public OrcFile.Version getFileVersion() { + public static OrcFile.Version getFileVersion(List versionList) { + if (versionList == null || versionList.isEmpty()) { + return OrcFile.Version.V_0_11; + } for (OrcFile.Version version: OrcFile.Version.values()) { - if ((versionList != null && !versionList.isEmpty()) && - version.getMajor() == versionList.get(0) && + if (version.getMajor() == versionList.get(0) && version.getMinor() == versionList.get(1)) { return version; } } - return OrcFile.Version.V_0_11; + return OrcFile.Version.FUTURE; + } + + @Override + public OrcFile.Version getFileVersion() { + return getFileVersion(versionList); } @Override diff --git a/java/core/src/java/org/apache/orc/impl/WriterImpl.java b/java/core/src/java/org/apache/orc/impl/WriterImpl.java index d9140e806c..27792127e0 100644 --- a/java/core/src/java/org/apache/orc/impl/WriterImpl.java +++ b/java/core/src/java/org/apache/orc/impl/WriterImpl.java @@ -107,10 +107,11 @@ public class WriterImpl implements Writer, MemoryManager.Callback { private final int rowIndexStride; private final CompressionKind compress; private final CompressionCodec codec; - private final int bufferSize; + private int bufferSize; private final long blockSize; private final TypeDescription schema; private final PhysicalWriter physicalWriter; + private final OrcFile.WriterVersion writerVersion; private int columnCount; private long rowCount = 0; @@ -145,6 +146,7 @@ public WriterImpl(FileSystem fs, this.conf = opts.getConfiguration(); this.callback = opts.getCallback(); this.schema = opts.getSchema(); + this.writerVersion = opts.getWriterVersion(); bloomFilterVersion = 
opts.getBloomFilterVersion(); if (callback != null) { callbackContext = new OrcFile.WriterContext(){ @@ -211,6 +213,18 @@ public static int getEstimatedBufferSize(long stripeSize, int numColumns, return estBufferSize > bs ? bs : estBufferSize; } + /** + * Increase the buffer size for this writer. + * This function is internal only and should only be called by the + * ORC file merger. + * @param newSize the new buffer size. + */ + public void increaseCompressionSize(int newSize) { + if (newSize > bufferSize) { + bufferSize = newSize; + } + } + private static int getClosestBufferSize(int estBufferSize) { final int kb4 = 4 * 1024; final int kb8 = 8 * 1024; @@ -2736,7 +2750,7 @@ private long writePostScript() throws IOException { .setMagic(OrcFile.MAGIC) .addVersion(version.getMajor()) .addVersion(version.getMinor()) - .setWriterVersion(OrcFile.CURRENT_WRITER.getId()); + .setWriterVersion(writerVersion.getId()); if (compress != CompressionKind.NONE) { builder.setCompressionBlockSize(bufferSize); } @@ -2864,6 +2878,7 @@ public void appendStripe(byte[] stripe, int offset, int length, checkArgument(stripeStatistics != null, "Stripe statistics must not be null"); + rowsInStripe = stripeInfo.getNumberOfRows(); // update stripe information OrcProto.StripeInformation.Builder dirEntry = OrcProto.StripeInformation .newBuilder() @@ -2883,6 +2898,7 @@ public void appendStripe(byte[] stripe, int offset, int length, stripes.add(dirEntry.build()); // reset it after writing the stripe + rowCount += rowsInStripe; rowsInStripe = 0; } diff --git a/java/core/src/test/org/apache/orc/TestVectorOrcFile.java b/java/core/src/test/org/apache/orc/TestVectorOrcFile.java index 7df521df34..7801156d5b 100644 --- a/java/core/src/test/org/apache/orc/TestVectorOrcFile.java +++ b/java/core/src/test/org/apache/orc/TestVectorOrcFile.java @@ -20,6 +20,7 @@ import com.google.common.collect.Lists; +import org.apache.orc.impl.ReaderImpl; import org.junit.Assert; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.FileSystem; @@ -51,12 +52,15 @@ import org.junit.Rule; import org.junit.Test; import org.junit.rules.TestName; +import org.mockito.Mockito; import java.io.File; import java.io.IOException; import java.math.BigInteger; import java.net.URL; import java.nio.ByteBuffer; +import java.nio.charset.Charset; +import java.nio.charset.StandardCharsets; import java.sql.Date; import java.sql.Timestamp; import java.util.ArrayList; @@ -2944,4 +2948,170 @@ public void testWriterVersion() throws Exception { assertEquals(OrcFile.WriterVersion.ORIGINAL, OrcFile.WriterVersion.from(0)); assertEquals(OrcFile.WriterVersion.HIVE_4243, OrcFile.WriterVersion.from(2)); } + + /** + * Test whether the file versions are translated correctly + * @throws Exception + */ + @Test + public void testFileVersion() throws Exception { + assertEquals(OrcFile.Version.V_0_11, ReaderImpl.getFileVersion(null)); + assertEquals(OrcFile.Version.V_0_11, ReaderImpl.getFileVersion(new ArrayList())); + assertEquals(OrcFile.Version.V_0_11, + ReaderImpl.getFileVersion(Arrays.asList(new Integer[]{0, 11}))); + assertEquals(OrcFile.Version.V_0_12, + ReaderImpl.getFileVersion(Arrays.asList(new Integer[]{0, 12}))); + assertEquals(OrcFile.Version.FUTURE, + ReaderImpl.getFileVersion(Arrays.asList(new Integer[]{9999, 0}))); + } + + @Test + public void testMergeUnderstood() throws Exception { + Path p = new Path("test.orc"); + Reader futureVersion = Mockito.mock(Reader.class); + Mockito.when(futureVersion.getFileVersion()).thenReturn(OrcFile.Version.FUTURE); + 
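// the stubbed writerVersion below is a known one, so only the future file version makes understandFormat reject this reader +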
Mockito.when(futureVersion.getWriterVersion()).thenReturn(OrcFile.WriterVersion.HIVE_4243); + assertEquals(false, OrcFile.understandFormat(p, futureVersion)); + Reader futureWriter = Mockito.mock(Reader.class); + Mockito.when(futureWriter.getFileVersion()).thenReturn(OrcFile.Version.V_0_11); + Mockito.when(futureWriter.getWriterVersion()).thenReturn(OrcFile.WriterVersion.FUTURE); + assertEquals(false, OrcFile.understandFormat(p, futureWriter)); + Reader current = Mockito.mock(Reader.class); + Mockito.when(current.getFileVersion()).thenReturn(OrcFile.Version.CURRENT); + Mockito.when(current.getWriterVersion()).thenReturn(OrcFile.CURRENT_WRITER); + assertEquals(true, OrcFile.understandFormat(p, current)); + } + + static ByteBuffer fromString(String s) { + return ByteBuffer.wrap(s.getBytes(StandardCharsets.UTF_8)); + } + + static byte[] fromInt(int x) { + return Integer.toHexString(x).getBytes(StandardCharsets.UTF_8); + } + + @Test + public void testMerge() throws Exception { + Path input1 = new Path(workDir, "TestVectorOrcFile.testMerge1.orc"); + fs.delete(input1, false); + Path input2 = new Path(workDir, "TestVectorOrcFile.testMerge2.orc"); + fs.delete(input2, false); + Path input3 = new Path(workDir, "TestVectorOrcFile.testMerge3.orc"); + fs.delete(input3, false); + TypeDescription schema = TypeDescription.fromString("struct<a:int,b:string>"); + // change all of the options away from default to find anything we + // don't copy to the merged file + OrcFile.WriterOptions opts = OrcFile.writerOptions(conf) + .setSchema(schema) + .compress(CompressionKind.LZO) + .enforceBufferSize() + .bufferSize(20*1024) + .rowIndexStride(1000) + .version(OrcFile.Version.V_0_11) + .writerVersion(OrcFile.WriterVersion.HIVE_8732); + + Writer writer = OrcFile.createWriter(input1, opts); + VectorizedRowBatch batch = schema.createRowBatch(); + batch.size = 1024; + for(int r=0; r < 1024; ++r) { + ((LongColumnVector) batch.cols[0]).vector[r] = r; + ((BytesColumnVector) batch.cols[1]).setVal(r, fromInt(r)); + } + writer.addRowBatch(batch); + writer.addUserMetadata("a", fromString("foo")); + writer.addUserMetadata("b", fromString("bar")); + writer.close(); + + // increase the buffer size to 30k + opts.bufferSize(30*1024); + writer = OrcFile.createWriter(input2, opts); + batch.size = 1024; + for(int r=0; r < 1024; ++r) { + ((LongColumnVector) batch.cols[0]).vector[r] = 2 * r; + ((BytesColumnVector) batch.cols[1]).setVal(r, fromInt(2 * r)); + } + writer.addRowBatch(batch); + writer.addUserMetadata("a", fromString("foo")); + writer.addUserMetadata("c", fromString("baz")); + writer.close(); + + // decrease the buffer size to 10k + opts.bufferSize(10*1024); + writer = OrcFile.createWriter(input3, opts); + batch.size = 1024; + for(int r=0; r < 1024; ++r) { + ((LongColumnVector) batch.cols[0]).vector[r] = 3 * r; + ((BytesColumnVector) batch.cols[1]).setVal(r, fromInt(3 * r)); + } + writer.addRowBatch(batch); + writer.addUserMetadata("c", fromString("baz")); + writer.addUserMetadata("d", fromString("bat")); + writer.close(); + + Path output1 = new Path(workDir, "TestVectorOrcFile.testMerge.out1.orc"); + fs.delete(output1, false); + List<Path> paths = OrcFile.mergeFiles(output1, + OrcFile.writerOptions(conf), Arrays.asList(input1, input2, input3)); + assertEquals(3, paths.size()); + Reader reader = OrcFile.createReader(output1, OrcFile.readerOptions(conf)); + assertEquals(3 * 1024, reader.getNumberOfRows()); + assertEquals(CompressionKind.LZO, reader.getCompressionKind()); + assertEquals(30 * 1024, reader.getCompressionSize()); + assertEquals(1000, reader.getRowIndexStride()); + assertEquals(OrcFile.Version.V_0_11, reader.getFileVersion()); + assertEquals(OrcFile.WriterVersion.HIVE_8732, reader.getWriterVersion()); + assertEquals(3, reader.getStripes().size()); + assertEquals(4, reader.getMetadataKeys().size()); + assertEquals(fromString("foo"), reader.getMetadataValue("a")); + assertEquals(fromString("bar"), reader.getMetadataValue("b")); + assertEquals(fromString("baz"), reader.getMetadataValue("c")); + assertEquals(fromString("bat"), reader.getMetadataValue("d"));
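+ + // input4 below uses a different schema and input5 a different compression, + // so the second merge should reject both and copy only input3 and input1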
+ + TypeDescription schema4 = TypeDescription.fromString("struct<a:int>"); + Path input4 = new Path(workDir, "TestVectorOrcFile.testMerge4.orc"); + fs.delete(input4, false); + opts.setSchema(schema4); + writer = OrcFile.createWriter(input4, opts); + batch = schema4.createRowBatch(); + batch.size = 1024; + for(int r=0; r < 1024; ++r) { + ((LongColumnVector) batch.cols[0]).vector[r] = 4 * r; + } + writer.addRowBatch(batch); + writer.close(); + + Path input5 = new Path(workDir, "TestVectorOrcFile.testMerge5.orc"); + fs.delete(input5, false); + opts.setSchema(schema) + .compress(CompressionKind.NONE) + .bufferSize(100*1024); + writer = OrcFile.createWriter(input5, opts); + batch = schema.createRowBatch(); + batch.size = 1024; + for(int r=0; r < 1024; ++r) { + ((LongColumnVector) batch.cols[0]).vector[r] = 4 * r; + ((BytesColumnVector) batch.cols[1]).setVal(r, fromInt(5 * r)); + } + writer.addRowBatch(batch); + writer.close(); + + Path output2 = new Path(workDir, "TestVectorOrcFile.testMerge.out2.orc"); + fs.delete(output2, false); + paths = OrcFile.mergeFiles(output2, OrcFile.writerOptions(conf), + Arrays.asList(input3, input4, input1, input5)); + assertEquals(2, paths.size()); + reader = OrcFile.createReader(output2, OrcFile.readerOptions(conf)); + assertEquals(2 * 1024, reader.getNumberOfRows()); + assertEquals(CompressionKind.LZO, reader.getCompressionKind()); + assertEquals(20 * 1024, reader.getCompressionSize()); + assertEquals(1000, reader.getRowIndexStride()); + assertEquals(OrcFile.Version.V_0_11, reader.getFileVersion()); + assertEquals(OrcFile.WriterVersion.HIVE_8732, reader.getWriterVersion()); + assertEquals(2, reader.getStripes().size()); + assertEquals(4, reader.getMetadataKeys().size()); + assertEquals(fromString("foo"), reader.getMetadataValue("a")); + assertEquals(fromString("bar"), reader.getMetadataValue("b")); + assertEquals(fromString("baz"), reader.getMetadataValue("c")); + assertEquals(fromString("bat"), reader.getMetadataValue("d")); + } } From c0fbaed7c4bac987dc0bb4f9e91b930120100d0d Mon Sep 17 00:00:00 2001 From: Prasanth Jayachandran Date: Mon, 23 Jan 2017 14:43:09 -0800 Subject: [PATCH 04/21] ORC-134: DecimalColumnStatistics methods throws NPE when all column values are null (prasanthj) Fixes #86 Signed-off-by: Prasanth Jayachandran --- .gitignore | 2 ++ .../apache/orc/impl/ColumnStatisticsImpl.java | 6 ++-- .../org/apache/orc/TestVectorOrcFile.java | 29 +++++++++++++++++++ 3 files changed, 34 insertions(+), 3 deletions(-) diff --git a/.gitignore b/.gitignore index 167ac880e8..40178ecaeb 100644 --- a/.gitignore +++ b/.gitignore @@ -2,5 +2,7 @@ build target *~ *.iml +*.ipr +*.iws .idea .DS_Store diff --git a/java/core/src/java/org/apache/orc/impl/ColumnStatisticsImpl.java b/java/core/src/java/org/apache/orc/impl/ColumnStatisticsImpl.java index 70018e39b6..cdf9c46e12 100644 --- a/java/core/src/java/org/apache/orc/impl/ColumnStatisticsImpl.java +++ b/java/core/src/java/org/apache/orc/impl/ColumnStatisticsImpl.java @@ -878,17 +878,17 @@ public
OrcProto.ColumnStatistics.Builder serialize() { @Override public HiveDecimal getMinimum() { - return minimum.getHiveDecimal(); + return minimum == null ? null : minimum.getHiveDecimal(); } @Override public HiveDecimal getMaximum() { - return maximum.getHiveDecimal(); + return maximum == null ? null : maximum.getHiveDecimal(); } @Override public HiveDecimal getSum() { - return sum.getHiveDecimal(); + return sum == null ? null : sum.getHiveDecimal(); } @Override diff --git a/java/core/src/test/org/apache/orc/TestVectorOrcFile.java b/java/core/src/test/org/apache/orc/TestVectorOrcFile.java index 7801156d5b..c3df81368a 100644 --- a/java/core/src/test/org/apache/orc/TestVectorOrcFile.java +++ b/java/core/src/test/org/apache/orc/TestVectorOrcFile.java @@ -487,6 +487,35 @@ public void testStringAndBinaryStatistics() throws Exception { rows.close(); } + @Test + public void testHiveDecimalStatsAllNulls() throws Exception { + TypeDescription schema = TypeDescription.createStruct() + .addField("dec1", TypeDescription.createDecimal()); + + Writer writer = OrcFile.createWriter(testFilePath, + OrcFile.writerOptions(conf).setSchema(schema).stripeSize(100000).bufferSize(10000)); + VectorizedRowBatch batch = schema.createRowBatch(); + batch.size = 4; + DecimalColumnVector field1 = (DecimalColumnVector) batch.cols[0]; + field1.noNulls = false; + field1.isNull[0] = true; + field1.isNull[1] = true; + field1.isNull[2] = true; + field1.isNull[3] = true; + writer.addRowBatch(batch); + writer.close(); + + Reader reader = OrcFile.createReader(testFilePath, + OrcFile.readerOptions(conf).filesystem(fs)); + // check the stats + ColumnStatistics[] stats = reader.getStatistics(); + Assert.assertEquals(4, stats[0].getNumberOfValues()); + Assert.assertEquals(0, stats[1].getNumberOfValues()); + Assert.assertEquals(true, stats[1].hasNull()); + Assert.assertNull(((DecimalColumnStatistics)stats[1]).getMinimum()); + Assert.assertNull(((DecimalColumnStatistics)stats[1]).getMaximum()); + Assert.assertEquals(new HiveDecimalWritable(0).getHiveDecimal(), ((DecimalColumnStatistics)stats[1]).getSum()); + } @Test public void testStripeLevelStats() throws Exception { From 2abd1e80e5c845c42320979c387780edd9a43231 Mon Sep 17 00:00:00 2001 From: Tarun Kumar Date: Mon, 23 Jan 2017 19:02:53 +0530 Subject: [PATCH 05/21] ORC-100. 
Build should fail if there are findbug errors in modules (Tarun Kumar and omalley via omalley) Fixes #85 Signed-off-by: Owen O'Malley --- java/core/src/findbugs/exclude.xml | 15 ++++++++ .../src/java/org/apache/orc/DataReader.java | 2 +- java/core/src/java/org/apache/orc/Reader.java | 23 ++++++------ .../java/org/apache/orc/TypeDescription.java | 2 +- .../java/org/apache/orc/impl/BufferChunk.java | 14 ++++++++ .../apache/orc/impl/ColumnStatisticsImpl.java | 4 +-- .../orc/impl/ConvertTreeReaderFactory.java | 5 --- .../org/apache/orc/impl/DynamicByteArray.java | 2 +- .../apache/orc/impl/HadoopShimsCurrent.java | 2 +- .../org/apache/orc/impl/HadoopShims_2_2.java | 2 +- .../org/apache/orc/impl/PhysicalFsWriter.java | 2 +- .../java/org/apache/orc/impl/ReaderImpl.java | 13 ++++--- .../org/apache/orc/impl/RecordReaderImpl.java | 11 ++---- .../apache/orc/impl/RecordReaderUtils.java | 16 +++------ .../org/apache/orc/impl/SchemaEvolution.java | 2 +- .../java/org/apache/orc/impl/StreamName.java | 2 +- .../apache/orc/impl/TreeReaderFactory.java | 7 ---- .../java/org/apache/orc/impl/WriterImpl.java | 2 +- .../java/org/apache/orc/util/BloomFilter.java | 25 ++++++++----- java/pom.xml | 36 ++++++++++--------- java/tools/src/findbugs/exclude.xml | 19 ++++++++++ .../java/org/apache/orc/tools/FileDump.java | 7 ++-- .../org/apache/orc/tools/JsonFileDump.java | 2 +- 23 files changed, 127 insertions(+), 88 deletions(-) create mode 100644 java/tools/src/findbugs/exclude.xml diff --git a/java/core/src/findbugs/exclude.xml b/java/core/src/findbugs/exclude.xml index 2f1148def1..1f079cbdf5 100644 --- a/java/core/src/findbugs/exclude.xml +++ b/java/core/src/findbugs/exclude.xml @@ -16,4 +16,19 @@ + + + + + + + + + + + + + + + diff --git a/java/core/src/java/org/apache/orc/DataReader.java b/java/core/src/java/org/apache/orc/DataReader.java index b3f91f2d7c..ae11bf383f 100644 --- a/java/core/src/java/org/apache/orc/DataReader.java +++ b/java/core/src/java/org/apache/orc/DataReader.java @@ -25,7 +25,7 @@ import org.apache.orc.impl.OrcIndex; /** An abstract data reader that IO formats can use to read bytes from underlying storage. */ -public interface DataReader extends AutoCloseable { +public interface DataReader extends AutoCloseable, Cloneable { /** Opens the DataReader, making it ready to use. */ void open() throws IOException; diff --git a/java/core/src/java/org/apache/orc/Reader.java b/java/core/src/java/org/apache/orc/Reader.java index 3c9681c69f..2ef64d72a8 100644 --- a/java/core/src/java/org/apache/orc/Reader.java +++ b/java/core/src/java/org/apache/orc/Reader.java @@ -148,7 +148,7 @@ public interface Reader { /** * Options for creating a RecordReader. */ - public static class Options { + public static class Options implements Cloneable { private boolean[] include; private long offset = 0; private long length = Long.MAX_VALUE; @@ -311,18 +311,15 @@ public boolean getForcePositionalEvolution() { } public Options clone() { - Options result = new Options(); - result.include = include; - result.offset = offset; - result.length = length; - result.sarg = sarg; - result.schema = schema; - result.columnNames = columnNames; - result.useZeroCopy = useZeroCopy; - result.skipCorruptRecords = skipCorruptRecords; - result.dataReader = dataReader == null ? 
null : dataReader.clone(); - result.tolerateMissingSchema = tolerateMissingSchema; - return result; + try { + Options result = (Options) super.clone(); + if (dataReader != null) { + result.dataReader = dataReader.clone(); + } + return result; + } catch (CloneNotSupportedException e) { + throw new UnsupportedOperationException("uncloneable", e); + } } @Override diff --git a/java/core/src/java/org/apache/orc/TypeDescription.java b/java/core/src/java/org/apache/orc/TypeDescription.java index 69d46b00e3..2086f3a428 100644 --- a/java/core/src/java/org/apache/orc/TypeDescription.java +++ b/java/core/src/java/org/apache/orc/TypeDescription.java @@ -40,7 +40,7 @@ * This is the description of the types in an ORC file. */ public class TypeDescription - implements Comparable, Serializable { + implements Comparable, Serializable, Cloneable { private static final int MAX_PRECISION = 38; private static final int MAX_SCALE = 38; private static final int DEFAULT_PRECISION = 38; diff --git a/java/core/src/java/org/apache/orc/impl/BufferChunk.java b/java/core/src/java/org/apache/orc/impl/BufferChunk.java index da43b96c5b..afde82f50c 100644 --- a/java/core/src/java/org/apache/orc/impl/BufferChunk.java +++ b/java/core/src/java/org/apache/orc/impl/BufferChunk.java @@ -78,6 +78,20 @@ public DiskRange sliceAndShift(long offset, long end, long shiftBy) { return new BufferChunk(sliceBuf, offset + shiftBy); } + @Override + public boolean equals(Object other) { + if (other == null || other.getClass() != getClass()) { + return false; + } + BufferChunk ob = (BufferChunk) other; + return chunk.equals(ob.chunk); + } + + @Override + public int hashCode() { + return chunk.hashCode(); + } + @Override public ByteBuffer getData() { return chunk; diff --git a/java/core/src/java/org/apache/orc/impl/ColumnStatisticsImpl.java b/java/core/src/java/org/apache/orc/impl/ColumnStatisticsImpl.java index cdf9c46e12..a8596db8ee 100644 --- a/java/core/src/java/org/apache/orc/impl/ColumnStatisticsImpl.java +++ b/java/core/src/java/org/apache/orc/impl/ColumnStatisticsImpl.java @@ -569,7 +569,7 @@ public void updateString(byte[] bytes, int offset, int length, maximum = new Text(); maximum.set(bytes, offset, length); } - sum += length * repetitions; + sum += (long)length * repetitions; } @Override @@ -711,7 +711,7 @@ public void updateBinary(BytesWritable value) { @Override public void updateBinary(byte[] bytes, int offset, int length, int repetitions) { - sum += length * repetitions; + sum += (long)length * repetitions; } @Override diff --git a/java/core/src/java/org/apache/orc/impl/ConvertTreeReaderFactory.java b/java/core/src/java/org/apache/orc/impl/ConvertTreeReaderFactory.java index e1415d3646..ae43824096 100644 --- a/java/core/src/java/org/apache/orc/impl/ConvertTreeReaderFactory.java +++ b/java/core/src/java/org/apache/orc/impl/ConvertTreeReaderFactory.java @@ -1343,8 +1343,6 @@ public static class DecimalFromDecimalTreeReader extends ConvertTreeReader { private DecimalColumnVector fileDecimalColVector; private int filePrecision; private int fileScale; - private int readerPrecision; - private int readerScale; private DecimalColumnVector decimalColVector; DecimalFromDecimalTreeReader(int columnId, TypeDescription fileType, TypeDescription readerType, Context context) @@ -1352,8 +1350,6 @@ public static class DecimalFromDecimalTreeReader extends ConvertTreeReader { super(columnId); filePrecision = fileType.getPrecision(); fileScale = fileType.getScale(); - readerPrecision = readerType.getPrecision(); - readerScale = 
readerType.getScale(); decimalTreeReader = new DecimalTreeReader(columnId, context); setConvertTreeReader(decimalTreeReader); } @@ -1524,7 +1520,6 @@ public static class StringGroupFromDecimalTreeReader extends ConvertTreeReader { private final TypeDescription readerType; private DecimalColumnVector decimalColVector; private BytesColumnVector bytesColVector; - private byte[] scratchBuffer; StringGroupFromDecimalTreeReader(int columnId, TypeDescription fileType, TypeDescription readerType, Context context) throws IOException { diff --git a/java/core/src/java/org/apache/orc/impl/DynamicByteArray.java b/java/core/src/java/org/apache/orc/impl/DynamicByteArray.java index 986c2ac81b..96ea1ea9ab 100644 --- a/java/core/src/java/org/apache/orc/impl/DynamicByteArray.java +++ b/java/core/src/java/org/apache/orc/impl/DynamicByteArray.java @@ -298,6 +298,6 @@ public byte[] get() { * Get the size of the buffers. */ public long getSizeInBytes() { - return initializedChunks * chunkSize; + return (long) initializedChunks * chunkSize; } } diff --git a/java/core/src/java/org/apache/orc/impl/HadoopShimsCurrent.java b/java/core/src/java/org/apache/orc/impl/HadoopShimsCurrent.java index 5c53f744c7..54a48242c7 100644 --- a/java/core/src/java/org/apache/orc/impl/HadoopShimsCurrent.java +++ b/java/core/src/java/org/apache/orc/impl/HadoopShimsCurrent.java @@ -71,7 +71,7 @@ public ZeroCopyReaderShim getZeroCopyReader(FSDataInputStream in, return ZeroCopyShims.getZeroCopyReader(in, pool); } - private final class FastTextReaderShim implements TextReaderShim { + private static final class FastTextReaderShim implements TextReaderShim { private final DataInputStream din; public FastTextReaderShim(InputStream in) { diff --git a/java/core/src/java/org/apache/orc/impl/HadoopShims_2_2.java b/java/core/src/java/org/apache/orc/impl/HadoopShims_2_2.java index 3f65e74478..501101fff3 100644 --- a/java/core/src/java/org/apache/orc/impl/HadoopShims_2_2.java +++ b/java/core/src/java/org/apache/orc/impl/HadoopShims_2_2.java @@ -70,7 +70,7 @@ public ZeroCopyReaderShim getZeroCopyReader(FSDataInputStream in, return null; } - private final class BasicTextReaderShim implements TextReaderShim { + private static final class BasicTextReaderShim implements TextReaderShim { private final InputStream in; public BasicTextReaderShim(InputStream in) { diff --git a/java/core/src/java/org/apache/orc/impl/PhysicalFsWriter.java b/java/core/src/java/org/apache/orc/impl/PhysicalFsWriter.java index 17c73ffe23..dd4e23b725 100644 --- a/java/core/src/java/org/apache/orc/impl/PhysicalFsWriter.java +++ b/java/core/src/java/org/apache/orc/impl/PhysicalFsWriter.java @@ -147,7 +147,7 @@ private void padStripe(long indexSize, long dataSize, int footerSize) throws IOE * An output receiver that writes the ByteBuffers to the output stream * as they are received. 
*/ - private class DirectStream implements OutputReceiver { + private static class DirectStream implements OutputReceiver { private final FSDataOutputStream output; DirectStream(FSDataOutputStream output) { diff --git a/java/core/src/java/org/apache/orc/impl/ReaderImpl.java b/java/core/src/java/org/apache/orc/impl/ReaderImpl.java index 9c8c06bbc0..ac5cfb2897 100644 --- a/java/core/src/java/org/apache/orc/impl/ReaderImpl.java +++ b/java/core/src/java/org/apache/orc/impl/ReaderImpl.java @@ -690,11 +690,14 @@ private List getColumnIndicesFromNames(List colNames) { if (fieldNames.contains(colName)) { fieldIdx = fieldNames.indexOf(colName); } else { - String s = "Cannot find field for: " + colName + " in "; + StringBuilder s = new StringBuilder("Cannot find field for: "); + s.append(colName); + s.append(" in "); for (String fn : fieldNames) { - s += fn + ", "; + s.append(fn); + s.append(", "); } - LOG.warn(s); + LOG.warn(s.toString()); continue; } @@ -747,8 +750,10 @@ public List getOrcProtoFileStatistics() { @Override public List getStripeStatistics() throws IOException { - if (stripeStats == null && metadata == null) { + if (metadata == null) { metadata = extractMetadata(tail.getSerializedTail(), 0, metadataSize, codec, bufferSize); + } + if (stripeStats == null) { stripeStats = metadata.getStripeStatsList(); } List result = new ArrayList<>(); diff --git a/java/core/src/java/org/apache/orc/impl/RecordReaderImpl.java b/java/core/src/java/org/apache/orc/impl/RecordReaderImpl.java index bb6dd2d409..2624cc7c77 100644 --- a/java/core/src/java/org/apache/orc/impl/RecordReaderImpl.java +++ b/java/core/src/java/org/apache/orc/impl/RecordReaderImpl.java @@ -411,9 +411,7 @@ static TruthValue evaluatePredicateRange(PredicateLeaf predicate, Object min, // in case failed conversion, return the default YES_NO_NULL truth value } catch (Exception e) { if (LOG.isWarnEnabled()) { - final String statsType = min == null ? - (max == null ? "null" : max.getClass().getSimpleName()) : - min.getClass().getSimpleName(); + final String statsType = min.getClass().getSimpleName(); final String predicateType = baseObj == null ? "null" : baseObj.getClass().getSimpleName(); final String reason = e.getClass().getSimpleName() + " when evaluating predicate." + " Skipping ORC PPD." + @@ -682,10 +680,7 @@ private static Object getBaseObjectForComparison(PredicateLeaf.Type type, Object } break; case STRING: - if (obj != null) { - return (obj.toString()); - } - break; + return (obj.toString()); case TIMESTAMP: if (obj instanceof Timestamp) { return obj; @@ -712,7 +707,7 @@ private static Object getBaseObjectForComparison(PredicateLeaf.Type type, Object } throw new IllegalArgumentException(String.format( - "ORC SARGS could not convert from %s to %s", obj == null ? 
"(null)" : obj.getClass() + "ORC SARGS could not convert from %s to %s", obj.getClass() .getSimpleName(), type)); } diff --git a/java/core/src/java/org/apache/orc/impl/RecordReaderUtils.java b/java/core/src/java/org/apache/orc/impl/RecordReaderUtils.java index cadee35b77..1b8db99658 100644 --- a/java/core/src/java/org/apache/orc/impl/RecordReaderUtils.java +++ b/java/core/src/java/org/apache/orc/impl/RecordReaderUtils.java @@ -149,16 +149,6 @@ private static class DefaultDataReader implements DataReader { private final int bufferSize; private final int typeCount; - private DefaultDataReader(DefaultDataReader other) { - this.pool = other.pool; - this.bufferSize = other.bufferSize; - this.typeCount = other.typeCount; - this.fs = other.fs; - this.path = other.path; - this.useZeroCopy = other.useZeroCopy; - this.codec = other.codec; - } - private DefaultDataReader(DataReaderProperties properties) { this.fs = properties.getFileSystem(); this.path = properties.getPath(); @@ -308,7 +298,11 @@ public void releaseBuffer(ByteBuffer buffer) { @Override public DataReader clone() { - return new DefaultDataReader(this); + try { + return (DataReader) super.clone(); + } catch (CloneNotSupportedException e) { + throw new UnsupportedOperationException("uncloneable", e); + } } } diff --git a/java/core/src/java/org/apache/orc/impl/SchemaEvolution.java b/java/core/src/java/org/apache/orc/impl/SchemaEvolution.java index f6f3ac735b..a438db3abf 100644 --- a/java/core/src/java/org/apache/orc/impl/SchemaEvolution.java +++ b/java/core/src/java/org/apache/orc/impl/SchemaEvolution.java @@ -490,7 +490,7 @@ static TypeDescription getBaseRow(TypeDescription typeDescription) { return typeDescription.getChildren().get(ACID_ROW_OFFSET); } - public static final List acidEventFieldNames= + private static final List acidEventFieldNames= new ArrayList(); static { diff --git a/java/core/src/java/org/apache/orc/impl/StreamName.java b/java/core/src/java/org/apache/orc/impl/StreamName.java index e3561bf1e0..1e877887ee 100644 --- a/java/core/src/java/org/apache/orc/impl/StreamName.java +++ b/java/core/src/java/org/apache/orc/impl/StreamName.java @@ -53,7 +53,7 @@ public int compareTo(StreamName streamName) { Area area = getArea(kind); Area otherArea = streamName.getArea(streamName.kind); if (area != otherArea) { - return -area.compareTo(otherArea); + return otherArea.compareTo(area); } if (column != streamName.column) { return column < streamName.column ? 
-1 : 1; diff --git a/java/core/src/java/org/apache/orc/impl/TreeReaderFactory.java b/java/core/src/java/org/apache/orc/impl/TreeReaderFactory.java index ccf2f2957d..0308393c62 100644 --- a/java/core/src/java/org/apache/orc/impl/TreeReaderFactory.java +++ b/java/core/src/java/org/apache/orc/impl/TreeReaderFactory.java @@ -96,7 +96,6 @@ public String getWriterTimezone() { public abstract static class TreeReader { protected final int columnId; protected BitFieldReader present = null; - protected boolean valuePresent = false; protected int vectorColumnCount; protected final Context context; @@ -109,7 +108,6 @@ protected TreeReader(int columnId, InStream in, Context context) throws IOExcept this.context = context; if (in == null) { present = null; - valuePresent = true; } else { present = new BitFieldReader(in, 1); } @@ -151,7 +149,6 @@ void startStripe(Map streams, OrcProto.Stream.Kind.PRESENT)); if (in == null) { present = null; - valuePresent = true; } else { present = new BitFieldReader(in, 1); } @@ -1369,7 +1366,6 @@ public static void readOrcByteArrays(InStream stream, public static class StringDirectTreeReader extends TreeReader { private static final HadoopShims SHIMS = HadoopShims.Factory.get(); protected InStream stream; - protected HadoopShims.TextReaderShim data; protected IntegerReader lengths; private final LongColumnVector scratchlcv; @@ -1384,7 +1380,6 @@ protected StringDirectTreeReader(int columnId, InStream present, InStream data, this.stream = data; if (length != null && encoding != null) { this.lengths = createIntegerReader(encoding, length, false, context); - this.data = SHIMS.getTextReaderShim(this.stream); } } @@ -1405,7 +1400,6 @@ void startStripe(Map streams, StreamName name = new StreamName(columnId, OrcProto.Stream.Kind.DATA); stream = streams.get(name); - data = SHIMS.getTextReaderShim(this.stream); lengths = createIntegerReader(stripeFooter.getColumnsList().get(columnId).getKind(), streams.get(new StreamName(columnId, OrcProto.Stream.Kind.LENGTH)), false, context); @@ -2146,7 +2140,6 @@ public static TreeReader createTreeReader(TypeDescription readerType, Context context ) throws IOException { final SchemaEvolution evolution = context.getSchemaEvolution(); - final boolean[] included = evolution.getReaderIncluded(); TypeDescription fileType = evolution.getFileType(readerType); if (fileType == null || !evolution.includeReaderColumn(readerType.getId())){ return new NullTreeReader(0); diff --git a/java/core/src/java/org/apache/orc/impl/WriterImpl.java b/java/core/src/java/org/apache/orc/impl/WriterImpl.java index 27792127e0..7766408762 100644 --- a/java/core/src/java/org/apache/orc/impl/WriterImpl.java +++ b/java/core/src/java/org/apache/orc/impl/WriterImpl.java @@ -208,7 +208,7 @@ public static int getEstimatedBufferSize(long stripeSize, int numColumns, // we want to guarantee that each stream gets ~10 buffers. // This keeps buffers small enough that we don't get really small stripe // sizes. - int estBufferSize = (int) (stripeSize / (20 * numColumns)); + int estBufferSize = (int) (stripeSize / (20L * numColumns)); estBufferSize = getClosestBufferSize(estBufferSize); return estBufferSize > bs ? 
bs : estBufferSize; } diff --git a/java/core/src/java/org/apache/orc/util/BloomFilter.java b/java/core/src/java/org/apache/orc/util/BloomFilter.java index 0d9db2439e..d609d56c2e 100644 --- a/java/core/src/java/org/apache/orc/util/BloomFilter.java +++ b/java/core/src/java/org/apache/orc/util/BloomFilter.java @@ -99,12 +99,13 @@ public boolean equals(Object other) { bitSet.equals(((BloomFilter) other).bitSet); } + @Override + public int hashCode() { + return bitSet.hashCode() + numHashFunctions * 5; + } + public void add(byte[] val) { - if (val == null) { - addBytes(val, -1, -1); - } else { - addBytes(val, 0, val.length); - } + addBytes(val, 0, val == null ? 0 : val.length); } public void addBytes(byte[] val, int offset, int length) { @@ -151,10 +152,7 @@ public void addDouble(double val) { } public boolean test(byte[] val) { - if (val == null) { - return testBytes(val, -1, -1); - } - return testBytes(val, 0, val.length); + return testBytes(val, 0, val == null ? 0 : val.length); } public boolean testBytes(byte[] val, int offset, int length) { @@ -324,5 +322,14 @@ public boolean equals(Object other) { other.getClass() == getClass() && Arrays.equals(data, ((BitSet) other).data); } + + @Override + public int hashCode() { + int result = 0; + for(long l: data) { + result = (int) (result * 13 + l); + } + return result; + } } } diff --git a/java/pom.xml b/java/pom.xml index 6f0246e3ad..914f824d4e 100644 --- a/java/pom.xml +++ b/java/pom.xml @@ -145,6 +145,26 @@ maven-project-info-reports-plugin 2.9 + + org.codehaus.mojo + findbugs-maven-plugin + 3.0.3 + + Max + ${basedir}/src/findbugs/exclude.xml + true + ${project.build.directory}/findbugs + + + + analyze-compile + compile + + check + + + + @@ -203,22 +223,6 @@ - - - - org.codehaus.mojo - findbugs-maven-plugin - 3.0.3 - - Max - ${basedir}/src/findbugs/exclude.xml - true - ${project.build.directory}/findbugs - - - - - cmake diff --git a/java/tools/src/findbugs/exclude.xml b/java/tools/src/findbugs/exclude.xml new file mode 100644 index 0000000000..d6441f15d3 --- /dev/null +++ b/java/tools/src/findbugs/exclude.xml @@ -0,0 +1,19 @@ + + + + + + + diff --git a/java/tools/src/java/org/apache/orc/tools/FileDump.java b/java/tools/src/java/org/apache/orc/tools/FileDump.java index 7206503132..7b79ce5a46 100644 --- a/java/tools/src/java/org/apache/orc/tools/FileDump.java +++ b/java/tools/src/java/org/apache/orc/tools/FileDump.java @@ -18,6 +18,7 @@ package org.apache.orc.tools; import java.io.IOException; +import java.nio.charset.StandardCharsets; import java.text.DecimalFormat; import java.util.ArrayList; import java.util.Collection; @@ -432,11 +433,11 @@ private static void recoverFiles(final List corruptFiles, final Configur // find all MAGIC string and see if the file is readable from there int index = 0; long nextFooterOffset; - + byte[] magicBytes = OrcFile.MAGIC.getBytes(StandardCharsets.UTF_8); while (index != -1) { - index = indexOf(data, OrcFile.MAGIC.getBytes(), index + 1); + index = indexOf(data, magicBytes, index + 1); if (index != -1) { - nextFooterOffset = startPos + index + OrcFile.MAGIC.length() + 1; + nextFooterOffset = startPos + index + magicBytes.length + 1; if (isReadable(corruptPath, conf, nextFooterOffset)) { footerOffsets.add(nextFooterOffset); } diff --git a/java/tools/src/java/org/apache/orc/tools/JsonFileDump.java b/java/tools/src/java/org/apache/orc/tools/JsonFileDump.java index aa3072c318..0de07ad282 100644 --- a/java/tools/src/java/org/apache/orc/tools/JsonFileDump.java +++ 
b/java/tools/src/java/org/apache/orc/tools/JsonFileDump.java @@ -220,7 +220,7 @@ public static void printJsonMetaData(List files, rows.close(); writer.endObject(); - } catch (Exception e) { + } catch (Throwable e) { writer.key("status").value("FAILED"); throw e; } From 192407a272a55c9f631c91a135d3b5866a228a14 Mon Sep 17 00:00:00 2001 From: Owen O'Malley Date: Fri, 27 Jan 2017 16:09:40 -0800 Subject: [PATCH 06/21] Update versions after 1.3.0 release. Signed-off-by: Owen O'Malley --- CMakeLists.txt | 2 +- java/core/pom.xml | 2 +- java/mapreduce/pom.xml | 2 +- java/pom.xml | 8 ++++---- java/tools/pom.xml | 2 +- 5 files changed, 8 insertions(+), 8 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 2fee92ab7e..35ffd89372 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -17,7 +17,7 @@ project(ORC) # Version number of package SET(CPACK_PACKAGE_VERSION_MAJOR "1") SET(CPACK_PACKAGE_VERSION_MINOR "3") -SET(CPACK_PACKAGE_VERSION_PATCH "0") +SET(CPACK_PACKAGE_VERSION_PATCH "1") SET(ORC_VERSION "${CPACK_PACKAGE_VERSION_MAJOR}.${CPACK_PACKAGE_VERSION_MINOR}.${CPACK_PACKAGE_VERSION_PATCH}") option (BUILD_JAVA diff --git a/java/core/pom.xml b/java/core/pom.xml index 93b2a8bf73..a4e7e2d786 100644 --- a/java/core/pom.xml +++ b/java/core/pom.xml @@ -19,7 +19,7 @@ org.apache.orc orc - 1.3.0 + 1.3.1 ../pom.xml diff --git a/java/mapreduce/pom.xml b/java/mapreduce/pom.xml index 15ba425d32..0d760570d5 100644 --- a/java/mapreduce/pom.xml +++ b/java/mapreduce/pom.xml @@ -19,7 +19,7 @@ org.apache.orc orc - 1.3.0 + 1.3.1 ../pom.xml diff --git a/java/pom.xml b/java/pom.xml index 914f824d4e..5d8cc219d9 100644 --- a/java/pom.xml +++ b/java/pom.xml @@ -21,7 +21,7 @@ org.apache.orc orc - 1.3.0 + 1.3.1 pom Apache ORC @@ -238,17 +238,17 @@ org.apache.orc orc-core - 1.3.0 + 1.3.1 org.apache.orc orc-mapreduce - 1.3.0 + 1.3.1 org.apache.orc orc-tools - 1.3.0 + 1.3.1 diff --git a/java/tools/pom.xml b/java/tools/pom.xml index 9b49682083..d9726ba0d0 100644 --- a/java/tools/pom.xml +++ b/java/tools/pom.xml @@ -19,7 +19,7 @@ org.apache.orc orc - 1.3.0 + 1.3.1 ../pom.xml From ba19584d35c6e84dcda0bb3aa44b0c9dd3981411 Mon Sep 17 00:00:00 2001 From: Owen O'Malley Date: Fri, 27 Jan 2017 16:07:34 -0800 Subject: [PATCH 07/21] ORC-138. Fix part of HIVE-15335 that changed decimal schema evolution. 
(omalley) Fixes #88 Signed-off-by: Owen O'Malley --- .../orc/impl/ConvertTreeReaderFactory.java | 64 ++++++++++++++++--- 1 file changed, 54 insertions(+), 10 deletions(-) diff --git a/java/core/src/java/org/apache/orc/impl/ConvertTreeReaderFactory.java b/java/core/src/java/org/apache/orc/impl/ConvertTreeReaderFactory.java index ae43824096..7964340a85 100644 --- a/java/core/src/java/org/apache/orc/impl/ConvertTreeReaderFactory.java +++ b/java/core/src/java/org/apache/orc/impl/ConvertTreeReaderFactory.java @@ -599,18 +599,55 @@ public static class AnyIntegerFromDecimalTreeReader extends ConvertTreeReader { setConvertTreeReader(decimalTreeReader); } - private static HiveDecimal DECIMAL_MAX_LONG = HiveDecimal.create(Long.MAX_VALUE); - private static HiveDecimal DECIMAL_MIN_LONG = HiveDecimal.create(Long.MIN_VALUE); - @Override public void setConvertVectorElement(int elementNum) throws IOException { - HiveDecimal decimalValue = decimalColVector.vector[elementNum].getHiveDecimal(); - if (decimalValue.compareTo(DECIMAL_MAX_LONG) > 0 || - decimalValue.compareTo(DECIMAL_MIN_LONG) < 0) { + HiveDecimalWritable decWritable = decimalColVector.vector[elementNum]; + long[] vector = longColVector.vector; + Category readerCategory = readerType.getCategory(); + + // Check to see if the decimal will fit in the Hive integer data type. + // If not, set the element to null. + boolean isInRange; + switch (readerCategory) { + case BOOLEAN: + // No data loss for boolean. + vector[elementNum] = decWritable.signum() == 0 ? 0 : 1; + return; + case BYTE: + isInRange = decWritable.isByte(); + break; + case SHORT: + isInRange = decWritable.isShort(); + break; + case INT: + isInRange = decWritable.isInt(); + break; + case LONG: + isInRange = decWritable.isLong(); + break; + default: + throw new RuntimeException("Unexpected type kind " + readerCategory.name()); + } + if (!isInRange) { longColVector.isNull[elementNum] = true; longColVector.noNulls = false; } else { - downCastAnyInteger(longColVector, elementNum, decimalValue.longValue(), readerType); + switch (readerCategory) { + case BYTE: + vector[elementNum] = decWritable.byteValue(); + break; + case SHORT: + vector[elementNum] = decWritable.shortValue(); + break; + case INT: + vector[elementNum] = decWritable.intValue(); + break; + case LONG: + vector[elementNum] = decWritable.longValue(); + break; + default: + throw new RuntimeException("Unexpected type kind " + readerCategory.name()); + } } } @@ -1520,6 +1557,7 @@ public static class StringGroupFromDecimalTreeReader extends ConvertTreeReader { private final TypeDescription readerType; private DecimalColumnVector decimalColVector; private BytesColumnVector bytesColVector; + private byte[] scratchBuffer; StringGroupFromDecimalTreeReader(int columnId, TypeDescription fileType, TypeDescription readerType, Context context) throws IOException { @@ -1529,13 +1567,19 @@ public static class StringGroupFromDecimalTreeReader extends ConvertTreeReader { this.readerType = readerType; decimalTreeReader = new DecimalTreeReader(columnId, context); setConvertTreeReader(decimalTreeReader); + scratchBuffer = new byte[HiveDecimal.SCRATCH_BUFFER_LEN_TO_BYTES]; } @Override public void setConvertVectorElement(int elementNum) { - String string = decimalColVector.vector[elementNum].getHiveDecimal().toString(); - byte[] bytes = string.getBytes(StandardCharsets.UTF_8); - assignStringGroupVectorEntry(bytesColVector, elementNum, readerType, bytes); + HiveDecimalWritable decWritable = decimalColVector.vector[elementNum]; + + // Convert decimal 
into bytes instead of a String for better performance. + final int byteIndex = decWritable.toBytes(scratchBuffer); + + assignStringGroupVectorEntry( + bytesColVector, elementNum, readerType, + scratchBuffer, byteIndex, HiveDecimal.SCRATCH_BUFFER_LEN_TO_BYTES - byteIndex); } @Override
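A quick illustration of the conversion semantics in the patch above (a sketch only, not part of the patch; the class name is invented, but the HiveDecimalWritable range checks are the same ones AnyIntegerFromDecimalTreeReader now relies on): a decimal that fits the reader's integer type is narrowed, anything else is marked null.

import org.apache.hadoop.hive.common.type.HiveDecimal;
import org.apache.hadoop.hive.serde2.io.HiveDecimalWritable;

public class DecimalNarrowingSketch {
  public static void main(String[] args) {
    HiveDecimalWritable dec = new HiveDecimalWritable(HiveDecimal.create("300"));
    // 300 overflows a byte, so a BYTE reader column would null the element.
    System.out.println(dec.isByte());     // false
    // A SHORT reader column narrows it without data loss.
    System.out.println(dec.isShort());    // true
    System.out.println(dec.shortValue()); // 300
  }
}

From 2acb0cbd6997a0ff89d92f70137b87b8e1e54ede Mon Sep 17 00:00:00 2001 From: Owen O'Malley Date: Tue, 31 Jan 2017 09:19:07 -0800 Subject: [PATCH 08/21] bump version number to 1.3.2-snapshot --- CMakeLists.txt | 2 +- java/core/pom.xml | 2 +- java/mapreduce/pom.xml | 2 +- java/pom.xml | 8 ++++---- java/tools/pom.xml | 2 +- 5 files changed, 8 insertions(+), 8 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 35ffd89372..1714d0b769 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -17,7 +17,7 @@ project(ORC) # Version number of package SET(CPACK_PACKAGE_VERSION_MAJOR "1") SET(CPACK_PACKAGE_VERSION_MINOR "3") -SET(CPACK_PACKAGE_VERSION_PATCH "1") +SET(CPACK_PACKAGE_VERSION_PATCH "2-SNAPSHOT") SET(ORC_VERSION "${CPACK_PACKAGE_VERSION_MAJOR}.${CPACK_PACKAGE_VERSION_MINOR}.${CPACK_PACKAGE_VERSION_PATCH}") option (BUILD_JAVA diff --git a/java/core/pom.xml b/java/core/pom.xml index a4e7e2d786..8fae424c7e 100644 --- a/java/core/pom.xml +++ b/java/core/pom.xml @@ -19,7 +19,7 @@ org.apache.orc orc - 1.3.1 + 1.3.2-SNAPSHOT ../pom.xml diff --git a/java/mapreduce/pom.xml b/java/mapreduce/pom.xml index 0d760570d5..affa963e4a 100644 --- a/java/mapreduce/pom.xml +++ b/java/mapreduce/pom.xml @@ -19,7 +19,7 @@ org.apache.orc orc - 1.3.1 + 1.3.2-SNAPSHOT ../pom.xml diff --git a/java/pom.xml b/java/pom.xml index 5d8cc219d9..8becf3d57f 100644 --- a/java/pom.xml +++ b/java/pom.xml @@ -21,7 +21,7 @@ org.apache.orc orc - 1.3.1 + 1.3.2-SNAPSHOT pom Apache ORC @@ -238,17 +238,17 @@ org.apache.orc orc-core - 1.3.1 + 1.3.2-SNAPSHOT org.apache.orc orc-mapreduce - 1.3.1 + 1.3.2-SNAPSHOT org.apache.orc orc-tools - 1.3.1 + 1.3.2-SNAPSHOT diff --git a/java/tools/pom.xml b/java/tools/pom.xml index d9726ba0d0..0908c4dc3b 100644 --- a/java/tools/pom.xml +++ b/java/tools/pom.xml @@ -19,7 +19,7 @@ org.apache.orc orc - 1.3.1 + 1.3.2-SNAPSHOT ../pom.xml From d08d21604bfa014f8b90c6195845169de227c4cc Mon Sep 17 00:00:00 2001 From: Owen O'Malley Date: Sat, 4 Feb 2017 10:04:00 -0800 Subject: [PATCH 09/21] ORC-141. Make setting a memory manager for the writer public. (omalley) Fixes #89 Signed-off-by: Owen O'Malley --- .../java/org/apache/orc/MemoryManager.java | 68 +++++++++++++++++++ .../core/src/java/org/apache/orc/OrcFile.java | 7 +- ...oryManager.java => MemoryManagerImpl.java} | 18 ++--- .../java/org/apache/orc/impl/WriterImpl.java | 1 + .../org/apache/orc/TestVectorOrcFile.java | 16 +---- .../apache/orc/impl/TestMemoryManager.java | 10 +-- 6 files changed, 84 insertions(+), 36 deletions(-) create mode 100644 java/core/src/java/org/apache/orc/MemoryManager.java rename java/core/src/java/org/apache/orc/impl/{MemoryManager.java => MemoryManagerImpl.java} (94%) diff --git a/java/core/src/java/org/apache/orc/MemoryManager.java b/java/core/src/java/org/apache/orc/MemoryManager.java new file mode 100644 index 0000000000..3afd3f56c1 --- /dev/null +++ b/java/core/src/java/org/apache/orc/MemoryManager.java @@ -0,0 +1,68 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership.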
The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.orc; + +import org.apache.hadoop.fs.Path; + +import java.io.IOException; + +/** + * A memory manager that keeps a global context of how many ORC + * writers there are and manages the memory between them. For use cases with + * dynamic partitions, it is easy to end up with many writers in the same task. + * By managing the size of each allocation, we try to cut down the size of each + * allocation and keep the task from running out of memory. + * + * This class is not thread safe, but is re-entrant - ensure creation and all + * invocations are triggered from the same thread. + */ +public interface MemoryManager { + + interface Callback { + /** + * The writer needs to check its memory usage + * @param newScale the current scale factor for memory allocations + * @return true if the writer was over the limit + * @throws IOException + */ + boolean checkMemory(double newScale) throws IOException; + } + + /** + * Add a new writer's memory allocation to the pool. We use the path + * as a unique key to ensure that we don't get duplicates. + * @param path the file that is being written + * @param requestedAllocation the requested buffer size + */ + void addWriter(Path path, long requestedAllocation, + Callback callback) throws IOException; + + /** + * Remove the given writer from the pool. + * @param path the file that has been closed + */ + void removeWriter(Path path) throws IOException; + + /** + * Give the memory manager an opportunity for doing a memory check. + * @param rows number of rows added + * @throws IOException + */ + void addedRow(int rows) throws IOException; +}
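Together with the now-public WriterOptions.memory() (the OrcFile.java hunk below), the interface above lets callers plug in their own manager. A minimal sketch, assuming the orc-core artifact on the classpath; the no-op manager, schema, and output path are invented for illustration:

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.orc.MemoryManager;
import org.apache.orc.OrcFile;
import org.apache.orc.TypeDescription;
import org.apache.orc.Writer;

public class CustomMemoryManagerSketch {
  public static void main(String[] args) throws Exception {
    Configuration conf = new Configuration();
    // A manager that never throttles; a real one would track allocations and
    // call callback.checkMemory(scale) as the pool fills up.
    MemoryManager neverThrottle = new MemoryManager() {
      @Override
      public void addWriter(Path path, long requestedAllocation,
                            Callback callback) { }
      @Override
      public void removeWriter(Path path) { }
      @Override
      public void addedRow(int rows) { }
    };
    Writer writer = OrcFile.createWriter(new Path("/tmp/sketch.orc"),
        OrcFile.writerOptions(conf)
            .setSchema(TypeDescription.createStruct()
                .addField("x", TypeDescription.createInt()))
            .memory(neverThrottle));
    writer.close();
  }
}

diff --git a/java/core/src/java/org/apache/orc/OrcFile.java b/java/core/src/java/org/apache/orc/OrcFile.java index cfabba9b87..e3d108a4e5 100644 --- a/java/core/src/java/org/apache/orc/OrcFile.java +++ b/java/core/src/java/org/apache/orc/OrcFile.java @@ -30,7 +30,8 @@ import org.apache.hadoop.fs.FSDataInputStream; import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.Path; -import org.apache.orc.impl.MemoryManager; +import org.apache.orc.MemoryManager; +import org.apache.orc.impl.MemoryManagerImpl; import org.apache.orc.impl.OrcTail; import org.apache.orc.impl.ReaderImpl; import org.apache.orc.impl.WriterImpl; @@ -514,7 +515,7 @@ public WriterOptions physicalWriter(PhysicalWriter writer) { /** * A package local option to set the memory manager.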
*/ - protected WriterOptions memory(MemoryManager value) { + public WriterOptions memory(MemoryManager value) { memoryManagerValue = value; return this; } @@ -647,7 +648,7 @@ private static synchronized MemoryManager getStaticMemoryManager( memoryManager = new ThreadLocal() { @Override protected MemoryManager initialValue() { - return new MemoryManager(conf); + return new MemoryManagerImpl(conf); } }; } diff --git a/java/core/src/java/org/apache/orc/impl/MemoryManager.java b/java/core/src/java/org/apache/orc/impl/MemoryManagerImpl.java similarity index 94% rename from java/core/src/java/org/apache/orc/impl/MemoryManager.java rename to java/core/src/java/org/apache/orc/impl/MemoryManagerImpl.java index 23f753384a..bd6c1529fd 100644 --- a/java/core/src/java/org/apache/orc/impl/MemoryManager.java +++ b/java/core/src/java/org/apache/orc/impl/MemoryManagerImpl.java @@ -18,6 +18,7 @@ package org.apache.orc.impl; +import org.apache.orc.MemoryManager; import org.apache.orc.OrcConf; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -40,9 +41,9 @@ * This class is not thread safe, but is re-entrant - ensure creation and all * invocations are triggered from the same thread. */ -public class MemoryManager { +public class MemoryManagerImpl implements MemoryManager { - private static final Logger LOG = LoggerFactory.getLogger(MemoryManager.class); + private static final Logger LOG = LoggerFactory.getLogger(MemoryManagerImpl.class); /** * How often should we check the memory sizes? Measured in rows added @@ -73,22 +74,12 @@ private static class WriterInfo { } } - public interface Callback { - /** - * The writer needs to check its memory usage - * @param newScale the current scale factor for memory allocations - * @return true if the writer was over the limit - * @throws IOException - */ - boolean checkMemory(double newScale) throws IOException; - } - /** * Create the memory manager. * @param conf use the configuration to find the maximum size of the memory * pool. */ - public MemoryManager(Configuration conf) { + public MemoryManagerImpl(Configuration conf) { double maxLoad = OrcConf.MEMORY_POOL.getDouble(conf); totalMemoryPool = Math.round(ManagementFactory.getMemoryMXBean(). 
getHeapMemoryUsage().getMax() * maxLoad); @@ -174,6 +165,7 @@ public double getAllocationScale() { * @param rows number of rows added * @throws IOException */ + @Override public void addedRow(int rows) throws IOException { rowsAddedSinceCheck += rows; if (rowsAddedSinceCheck >= ROWS_BETWEEN_CHECKS) { diff --git a/java/core/src/java/org/apache/orc/impl/WriterImpl.java b/java/core/src/java/org/apache/orc/impl/WriterImpl.java index 7766408762..925164d5e9 100644 --- a/java/core/src/java/org/apache/orc/impl/WriterImpl.java +++ b/java/core/src/java/org/apache/orc/impl/WriterImpl.java @@ -40,6 +40,7 @@ import org.apache.orc.ColumnStatistics; import org.apache.orc.CompressionCodec; import org.apache.orc.CompressionKind; +import org.apache.orc.MemoryManager; import org.apache.orc.OrcConf; import org.apache.orc.OrcFile; import org.apache.orc.OrcProto; diff --git a/java/core/src/test/org/apache/orc/TestVectorOrcFile.java b/java/core/src/test/org/apache/orc/TestVectorOrcFile.java index c3df81368a..b7fa8eee88 100644 --- a/java/core/src/test/org/apache/orc/TestVectorOrcFile.java +++ b/java/core/src/test/org/apache/orc/TestVectorOrcFile.java @@ -44,7 +44,6 @@ import org.apache.hadoop.io.BytesWritable; import org.apache.hadoop.io.Text; import org.apache.orc.impl.DataReaderProperties; -import org.apache.orc.impl.MemoryManager; import org.apache.orc.impl.OrcIndex; import org.apache.orc.impl.RecordReaderImpl; import org.apache.orc.impl.RecordReaderUtils; @@ -59,7 +58,6 @@ import java.math.BigInteger; import java.net.URL; import java.nio.ByteBuffer; -import java.nio.charset.Charset; import java.nio.charset.StandardCharsets; import java.sql.Date; import java.sql.Timestamp; @@ -2038,8 +2036,7 @@ private void checkRandomRow(VectorizedRowBatch batch, new MiddleStruct(inner, inner2), list(), map(inner, inner2)); } - private static class MyMemoryManager extends MemoryManager { - final long totalSpace; + private static class MyMemoryManager implements MemoryManager { double rate; Path path = null; long lastAllocation = 0; @@ -2047,8 +2044,6 @@ private static class MyMemoryManager extends MemoryManager { Callback callback; MyMemoryManager(Configuration conf, long totalSpace, double rate) { - super(conf); - this.totalSpace = totalSpace; this.rate = rate; } @@ -2066,15 +2061,6 @@ public synchronized void removeWriter(Path path) { this.lastAllocation = 0; } - @Override - public long getTotalMemoryPool() { - return totalSpace; - } - - @Override - public double getAllocationScale() { - return rate; - } @Override public void addedRow(int count) throws IOException { diff --git a/java/core/src/test/org/apache/orc/impl/TestMemoryManager.java b/java/core/src/test/org/apache/orc/impl/TestMemoryManager.java index f48c545098..b84e6ed70a 100644 --- a/java/core/src/test/org/apache/orc/impl/TestMemoryManager.java +++ b/java/core/src/test/org/apache/orc/impl/TestMemoryManager.java @@ -19,7 +19,7 @@ import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.Path; -import org.apache.orc.impl.MemoryManager; +import org.apache.orc.MemoryManager; import org.hamcrest.BaseMatcher; import org.hamcrest.Description; import org.junit.Test; @@ -39,7 +39,7 @@ public class TestMemoryManager { private static final double ERROR = 0.000001; - private static class NullCallback implements MemoryManager.Callback { + private static class NullCallback implements MemoryManagerImpl.Callback { public boolean checkMemory(double newScale) { return false; } @@ -48,7 +48,7 @@ public boolean checkMemory(double newScale) { @Test public void 
testBasics() throws Exception { Configuration conf = new Configuration(); - MemoryManager mgr = new MemoryManager(conf); + MemoryManagerImpl mgr = new MemoryManagerImpl(conf); NullCallback callback = new NullCallback(); long poolSize = mgr.getTotalMemoryPool(); assertEquals(Math.round(ManagementFactory.getMemoryMXBean(). @@ -77,7 +77,7 @@ public void testBasics() throws Exception { public void testConfig() throws Exception { Configuration conf = new Configuration(); conf.set("hive.exec.orc.memory.pool", "0.9"); - MemoryManager mgr = new MemoryManager(conf); + MemoryManagerImpl mgr = new MemoryManagerImpl(conf); long mem = ManagementFactory.getMemoryMXBean().getHeapMemoryUsage().getMax(); System.err.print("Memory = " + mem); @@ -114,7 +114,7 @@ private static DoubleMatcher closeTo(double value, double error) { @Test public void testCallback() throws Exception { Configuration conf = new Configuration(); - MemoryManager mgr = new MemoryManager(conf); + MemoryManagerImpl mgr = new MemoryManagerImpl(conf); long pool = mgr.getTotalMemoryPool(); MemoryManager.Callback[] calls = new MemoryManager.Callback[20]; for(int i=0; i < calls.length; ++i) { From cf9b18d962f28771646063233499ce9890604825 Mon Sep 17 00:00:00 2001 From: Prasanth Jayachandran Date: Thu, 26 Jan 2017 17:48:25 -0800 Subject: [PATCH 10/21] ORC-135: PPD for timestamp is wrong when reader and writer timezones are different zones. (prasanthj and omalley) Fixes #87 Fixes #90 Signed-off-by: Owen O'Malley --- c++/include/orc/Reader.hh | 1 + c++/src/Reader.cc | 2 + .../core/src/java/org/apache/orc/OrcFile.java | 3 +- .../apache/orc/impl/ColumnStatisticsImpl.java | 38 +- .../org/apache/orc/impl/RecordReaderImpl.java | 30 +- .../apache/orc/impl/RecordReaderUtils.java | 2 + .../apache/orc/impl/SerializationUtils.java | 11 + .../apache/orc/impl/TreeReaderFactory.java | 16 +- .../java/org/apache/orc/impl/WriterImpl.java | 138 ++--- .../org/apache/orc/util/BloomFilterIO.java | 40 ++ .../org/apache/orc/TestColumnStatistics.java | 57 ++- .../org/apache/orc/TestOrcTimezonePPD.java | 407 +++++++++++++++ .../apache/orc/impl/TestRecordReaderImpl.java | 478 ++++++++++-------- .../org/apache/orc/util/TestBloomFilter.java | 7 +- .../java/org/apache/orc/tools/FileDump.java | 8 +- .../org/apache/orc/tools/JsonFileDump.java | 8 +- .../resources/orc-file-dump-bloomfilter.out | 94 ++-- .../resources/orc-file-dump-bloomfilter2.out | 118 ++--- .../orc-file-dump-dictionary-threshold.out | 2 +- .../src/test/resources/orc-file-dump.json | 110 ++-- .../src/test/resources/orc-file-dump.out | 2 +- .../src/test/resources/orc-file-has-null.out | 2 +- proto/orc_proto.proto | 11 +- 23 files changed, 1093 insertions(+), 492 deletions(-) create mode 100644 java/core/src/test/org/apache/orc/TestOrcTimezonePPD.java diff --git a/c++/include/orc/Reader.hh b/c++/include/orc/Reader.hh index e9466484eb..76d7853451 100644 --- a/c++/include/orc/Reader.hh +++ b/c++/include/orc/Reader.hh @@ -56,6 +56,7 @@ namespace orc { WriterVersion_HIVE_12055 = 3, WriterVersion_HIVE_13083 = 4, WriterVersion_ORC_101 = 5, + WriterVersion_ORC_135 = 6, WriterVersion_MAX = INT64_MAX }; diff --git a/c++/src/Reader.cc b/c++/src/Reader.cc index 1ddaebd3a3..034586d013 100644 --- a/c++/src/Reader.cc +++ b/c++/src/Reader.cc @@ -69,6 +69,8 @@ namespace orc { return "HIVE-13083"; case WriterVersion_ORC_101: return "ORC-101"; + case WriterVersion_ORC_135: + return "ORC-135"; } std::stringstream buffer; buffer << "future - " << version; diff --git a/java/core/src/java/org/apache/orc/OrcFile.java 
b/java/core/src/java/org/apache/orc/OrcFile.java index e3d108a4e5..f8555abcdb 100644 --- a/java/core/src/java/org/apache/orc/OrcFile.java +++ b/java/core/src/java/org/apache/orc/OrcFile.java @@ -120,6 +120,7 @@ public enum WriterVersion { HIVE_12055(3), // vectorized writer HIVE_13083(4), // decimal writer updating present stream wrongly ORC_101(5), // bloom filters use utf8 + ORC_135(6), // timestamp stats use utc // Don't use any magic numbers here except for the below: FUTURE(Integer.MAX_VALUE); // a version from a future writer @@ -173,7 +174,7 @@ public boolean includes(WriterVersion other) { /** * The WriterVersion for this version of the software. */ - public static final WriterVersion CURRENT_WRITER = WriterVersion.ORC_101; + public static final WriterVersion CURRENT_WRITER = WriterVersion.ORC_135; public enum EncodingStrategy { SPEED, COMPRESSION diff --git a/java/core/src/java/org/apache/orc/impl/ColumnStatisticsImpl.java b/java/core/src/java/org/apache/orc/impl/ColumnStatisticsImpl.java index a8596db8ee..f1ed646773 100644 --- a/java/core/src/java/org/apache/orc/impl/ColumnStatisticsImpl.java +++ b/java/core/src/java/org/apache/orc/impl/ColumnStatisticsImpl.java @@ -19,6 +19,7 @@ import java.sql.Date; import java.sql.Timestamp; +import java.util.TimeZone; import org.apache.hadoop.hive.serde2.io.HiveDecimalWritable; import org.apache.hadoop.hive.common.type.HiveDecimal; @@ -1118,10 +1119,18 @@ private static final class TimestampStatisticsImpl extends ColumnStatisticsImpl OrcProto.TimestampStatistics timestampStats = stats.getTimestampStatistics(); // min,max values serialized/deserialized as int (milliseconds since epoch) if (timestampStats.hasMaximum()) { - maximum = timestampStats.getMaximum(); + maximum = SerializationUtils.convertToUtc(TimeZone.getDefault(), + timestampStats.getMaximum()); } if (timestampStats.hasMinimum()) { - minimum = timestampStats.getMinimum(); + minimum = SerializationUtils.convertToUtc(TimeZone.getDefault(), + timestampStats.getMinimum()); + } + if (timestampStats.hasMaximumUtc()) { + maximum = timestampStats.getMaximumUtc(); + } + if (timestampStats.hasMinimumUtc()) { + minimum = timestampStats.getMinimumUtc(); + } } @@ -1134,14 +1143,9 @@ public void reset() { @Override public void updateTimestamp(Timestamp value) { - if (minimum == null) { - minimum = value.getTime(); - maximum = value.getTime(); - } else if (minimum > value.getTime()) { - minimum = value.getTime(); - } else if (maximum < value.getTime()) { - maximum = value.getTime(); - } + long millis = SerializationUtils.convertToUtc(TimeZone.getDefault(), + value.getTime()); + updateTimestamp(millis); } @Override @@ -1185,8 +1189,8 @@ public OrcProto.ColumnStatistics.Builder serialize() { OrcProto.TimestampStatistics.Builder timestampStats = OrcProto.TimestampStatistics .newBuilder(); if (getNumberOfValues() != 0 && minimum != null) { - timestampStats.setMinimum(minimum); - timestampStats.setMaximum(maximum); + timestampStats.setMinimumUtc(minimum); + timestampStats.setMaximumUtc(maximum); } result.setTimestampStatistics(timestampStats); return result; @@ -1194,18 +1198,22 @@ @Override public Timestamp getMinimum() { - return minimum == null ? null : new Timestamp(minimum); + return minimum == null ? null : + new Timestamp(SerializationUtils.convertFromUtc(TimeZone.getDefault(), + minimum)); } @Override public Timestamp getMaximum() { - return maximum == null ? null : new Timestamp(maximum); + return maximum == null ?
null : + new Timestamp(SerializationUtils.convertFromUtc(TimeZone.getDefault(), + maximum)); } @Override public String toString() { StringBuilder buf = new StringBuilder(super.toString()); - if (getNumberOfValues() != 0) { + if (minimum != null || maximum != null) { buf.append(" min: "); buf.append(getMinimum()); buf.append(" max: "); diff --git a/java/core/src/java/org/apache/orc/impl/RecordReaderImpl.java b/java/core/src/java/org/apache/orc/impl/RecordReaderImpl.java index 2624cc7c77..766eb00c0f 100644 --- a/java/core/src/java/org/apache/orc/impl/RecordReaderImpl.java +++ b/java/core/src/java/org/apache/orc/impl/RecordReaderImpl.java @@ -26,6 +26,7 @@ import java.util.HashMap; import java.util.List; import java.util.Map; +import java.util.TimeZone; import org.apache.orc.OrcFile; import org.apache.orc.util.BloomFilter; @@ -171,7 +172,7 @@ protected RecordReaderImpl(ReaderImpl fileReader, OrcConf.IGNORE_NON_UTF8_BLOOM_FILTERS.getBoolean(fileReader.conf); SearchArgument sarg = options.getSearchArgument(); if (sarg != null && rowIndexStride != 0) { - sargApp = new SargApplier(sarg, options.getColumnNames(), + sargApp = new SargApplier(sarg, rowIndexStride, evolution, writerVersion); @@ -357,14 +358,30 @@ static Object getMin(ColumnStatistics index) { static TruthValue evaluatePredicateProto(OrcProto.ColumnStatistics statsProto, PredicateLeaf predicate, OrcProto.Stream.Kind kind, + OrcProto.ColumnEncoding encoding, OrcProto.BloomFilter bloomFilter, OrcFile.WriterVersion writerVersion, TypeDescription.Category type) { ColumnStatistics cs = ColumnStatisticsImpl.deserialize(statsProto); Object minValue = getMin(cs); Object maxValue = getMax(cs); + // files written before ORC-135 stores timestamp wrt to local timezone causing issues with PPD. + // disable PPD for timestamp for all old files + if (type.equals(TypeDescription.Category.TIMESTAMP)) { + if (!writerVersion.includes(OrcFile.WriterVersion.ORC_135)) { + LOG.warn("Not using predication pushdown on {} because it doesn't " + + "include ORC-135. Writer version: {}", predicate.getColumnName(), + writerVersion); + return TruthValue.YES_NO_NULL; + } + if (predicate.getType() != PredicateLeaf.Type.TIMESTAMP && + predicate.getType() != PredicateLeaf.Type.DATE && + predicate.getType() != PredicateLeaf.Type.STRING) { + return TruthValue.YES_NO_NULL; + } + } return evaluatePredicateRange(predicate, minValue, maxValue, cs.hasNull(), - BloomFilterIO.deserialize(kind, writerVersion, type, bloomFilter)); + BloomFilterIO.deserialize(kind, encoding, writerVersion, type, bloomFilter)); } /** @@ -518,7 +535,6 @@ private static TruthValue evaluatePredicateMinMax(PredicateLeaf predicate, Objec loc = compareToRange((Comparable) predObj1, minValue, maxValue); if (loc == Location.BEFORE || loc == Location.MIN) { Object predObj2 = getBaseObjectForComparison(predicate.getType(), args.get(1)); - Location loc2 = compareToRange((Comparable) predObj2, minValue, maxValue); if (loc2 == Location.AFTER || loc2 == Location.MAX) { return hasNull ? 
TruthValue.YES_NULL : TruthValue.YES; @@ -581,7 +597,7 @@ private static TruthValue checkInBloomFilter(BloomFilter bf, Object predObj, boo result = TruthValue.YES_NO_NULL; } } else if (predObj instanceof Timestamp) { - if (bf.testLong(((Timestamp) predObj).getTime())) { + if (bf.testLong(SerializationUtils.convertToUtc(TimeZone.getDefault(), ((Timestamp) predObj).getTime()))) { result = TruthValue.YES_NO_NULL; } } else if (predObj instanceof Date) { @@ -725,7 +741,6 @@ public static class SargApplier { private SchemaEvolution evolution; public SargApplier(SearchArgument sarg, - String[] columnNames, long rowIndexStride, SchemaEvolution evolution, OrcFile.WriterVersion writerVersion) { @@ -758,6 +773,7 @@ public SargApplier(SearchArgument sarg, public boolean[] pickRowGroups(StripeInformation stripe, OrcProto.RowIndex[] indexes, OrcProto.Stream.Kind[] bloomFilterKinds, + List encodings, OrcProto.BloomFilterIndex[] bloomFilterIndices, boolean returnNone) throws IOException { long rowsInStripe = stripe.getNumberOfRows(); @@ -785,7 +801,7 @@ public boolean[] pickRowGroups(StripeInformation stripe, } if (evolution != null && evolution.isPPDSafeConversion(columnIx)) { leafValues[pred] = evaluatePredicateProto(stats, - sargLeaves.get(pred), bfk, bf, writerVersion, + sargLeaves.get(pred), bfk, encodings.get(columnIx), bf, writerVersion, evolution.getFileSchema().findSubtype(columnIx).getCategory()); } else { leafValues[pred] = TruthValue.YES_NO_NULL; @@ -827,7 +843,7 @@ protected boolean[] pickRowGroups() throws IOException { } readRowIndex(currentStripe, fileIncluded, sargApp.sargColumns); return sargApp.pickRowGroups(stripes.get(currentStripe), indexes, - bloomFilterKind, bloomFilterIndices, false); + bloomFilterKind, stripeFooter.getColumnsList(), bloomFilterIndices, false); } private void clearStreams() { diff --git a/java/core/src/java/org/apache/orc/impl/RecordReaderUtils.java b/java/core/src/java/org/apache/orc/impl/RecordReaderUtils.java index 1b8db99658..aa47219ea4 100644 --- a/java/core/src/java/org/apache/orc/impl/RecordReaderUtils.java +++ b/java/core/src/java/org/apache/orc/impl/RecordReaderUtils.java @@ -53,6 +53,8 @@ static boolean hadBadBloomFilters(TypeDescription.Category category, return !version.includes(OrcFile.WriterVersion.HIVE_12055); case DECIMAL: return true; + case TIMESTAMP: + return !version.includes(OrcFile.WriterVersion.ORC_135); default: return false; } } diff --git a/java/core/src/java/org/apache/orc/impl/SerializationUtils.java b/java/core/src/java/org/apache/orc/impl/SerializationUtils.java index 44c332f049..5c05adea69 100644 --- a/java/core/src/java/org/apache/orc/impl/SerializationUtils.java +++ b/java/core/src/java/org/apache/orc/impl/SerializationUtils.java @@ -23,6 +23,7 @@ import java.io.InputStream; import java.io.OutputStream; import java.math.BigInteger; +import java.util.TimeZone; public final class SerializationUtils { @@ -1308,4 +1309,14 @@ private long readLongBE8(InStream in, int rbOffset) { public boolean isSafeSubtract(long left, long right) { return (left ^ right) >= 0 | (left ^ (left - right)) >= 0; } + + public static long convertFromUtc(TimeZone local, long time) { + int offset = local.getOffset(time - local.getRawOffset()); + return time - offset; + } + + public static long convertToUtc(TimeZone local, long time) { + int offset = local.getOffset(time); + return time + offset; + } }
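The effect of the two new helpers in a few lines (a sketch, not part of the patch; the class name is invented). Away from DST transitions the calls invert each other, and the offset shift is what makes min/max statistics comparable across writer and reader zones:

import java.sql.Timestamp;
import java.util.TimeZone;
import org.apache.orc.impl.SerializationUtils;

public class UtcShiftSketch {
  public static void main(String[] args) {
    TimeZone la = TimeZone.getTimeZone("America/Los_Angeles");
    long local = Timestamp.valueOf("2015-01-15 12:00:00").getTime();
    long utc = SerializationUtils.convertToUtc(la, local);
    System.out.println(utc - local);  // -28800000, the -8 hour January offset
    System.out.println(SerializationUtils.convertFromUtc(la, utc) == local);  // true
  }
}

diff --git a/java/core/src/java/org/apache/orc/impl/TreeReaderFactory.java b/java/core/src/java/org/apache/orc/impl/TreeReaderFactory.java index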
0308393c62..4b369afe3a 100644 --- a/java/core/src/java/org/apache/orc/impl/TreeReaderFactory.java +++ b/java/core/src/java/org/apache/orc/impl/TreeReaderFactory.java @@ -19,7 +19,7 @@ import java.io.EOFException; import java.io.IOException; -import java.math.BigInteger; +import java.text.DateFormat; import java.text.ParseException; import java.text.SimpleDateFormat; import java.util.HashMap; @@ -863,12 +863,12 @@ void skipRows(long items) throws IOException { public static class TimestampTreeReader extends TreeReader { protected IntegerReader data = null; protected IntegerReader nanos = null; - private final boolean skipCorrupt; private Map baseTimestampMap; protected long base_timestamp; private final TimeZone readerTimeZone; private TimeZone writerTimeZone; private boolean hasSameTZRules; + private ThreadLocal threadLocalDateFormat; TimestampTreeReader(int columnId, Context context) throws IOException { this(columnId, null, null, null, null, context); @@ -878,7 +878,8 @@ protected TimestampTreeReader(int columnId, InStream presentStream, InStream dat InStream nanosStream, OrcProto.ColumnEncoding encoding, Context context) throws IOException { super(columnId, presentStream, context); - this.skipCorrupt = context.isSkipCorrupt(); + this.threadLocalDateFormat = new ThreadLocal<>(); + this.threadLocalDateFormat.set(new SimpleDateFormat("yyyy-MM-dd HH:mm:ss")); this.baseTimestampMap = new HashMap<>(); this.readerTimeZone = TimeZone.getDefault(); if (context.getWriterTimezone() == null || context.getWriterTimezone().isEmpty()) { @@ -934,17 +935,16 @@ protected long getBaseTimestamp(String timeZoneId) throws IOException { if (!baseTimestampMap.containsKey(timeZoneId)) { writerTimeZone = TimeZone.getTimeZone(timeZoneId); hasSameTZRules = writerTimeZone.hasSameRules(readerTimeZone); - SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss"); - sdf.setTimeZone(writerTimeZone); + threadLocalDateFormat.get().setTimeZone(writerTimeZone); try { - long epoch = - sdf.parse(WriterImpl.BASE_TIMESTAMP_STRING).getTime() / WriterImpl.MILLIS_PER_SECOND; + long epoch = threadLocalDateFormat.get() + .parse(WriterImpl.BASE_TIMESTAMP_STRING).getTime() / WriterImpl.MILLIS_PER_SECOND; baseTimestampMap.put(timeZoneId, epoch); return epoch; } catch (ParseException e) { throw new IOException("Unable to create base timestamp", e); } finally { - sdf.setTimeZone(readerTimeZone); + threadLocalDateFormat.get().setTimeZone(readerTimeZone); } } diff --git a/java/core/src/java/org/apache/orc/impl/WriterImpl.java b/java/core/src/java/org/apache/orc/impl/WriterImpl.java index 925164d5e9..ce955e3548 100644 --- a/java/core/src/java/org/apache/orc/impl/WriterImpl.java +++ b/java/core/src/java/org/apache/orc/impl/WriterImpl.java @@ -99,9 +99,6 @@ public class WriterImpl implements Writer, MemoryManager.Callback { private static final int MIN_ROW_INDEX_STRIDE = 1000; - // threshold above which buffer size will be automatically resized - private static final int COLUMN_COUNT_THRESHOLD = 1000; - private final Path path; private final long defaultStripeSize; private long adjustedStripeSize; @@ -479,7 +476,7 @@ private abstract static class TreeWriter { protected final boolean createBloomFilter; private final OrcProto.BloomFilterIndex.Builder bloomFilterIndex; private final OrcProto.BloomFilterIndex.Builder bloomFilterIndexUtf8; - private final OrcProto.BloomFilter.Builder bloomFilterEntry; + protected final OrcProto.BloomFilter.Builder bloomFilterEntry; private boolean foundNulls; private OutStream isPresentOutStream; private 
final List stripeStatsBuilders; @@ -735,9 +732,14 @@ TreeWriter[] getChildrenWriters() { * Get the encoding for this column. * @return the information about the encoding of this column */ - OrcProto.ColumnEncoding getEncoding() { - return OrcProto.ColumnEncoding.newBuilder().setKind( - OrcProto.ColumnEncoding.Kind.DIRECT).build(); + OrcProto.ColumnEncoding.Builder getEncoding() { + OrcProto.ColumnEncoding.Builder builder = + OrcProto.ColumnEncoding.newBuilder() + .setKind(OrcProto.ColumnEncoding.Kind.DIRECT); + if (createBloomFilter) { + builder.setBloomEncoding(BloomFilterIO.Encoding.CURRENT.getId()); + } + return builder; } /** @@ -956,13 +958,14 @@ private static class IntegerTreeWriter extends TreeWriter { } @Override - OrcProto.ColumnEncoding getEncoding() { + OrcProto.ColumnEncoding.Builder getEncoding() { + OrcProto.ColumnEncoding.Builder result = super.getEncoding(); if (isDirectV2) { - return OrcProto.ColumnEncoding.newBuilder() - .setKind(OrcProto.ColumnEncoding.Kind.DIRECT_V2).build(); + result.setKind(OrcProto.ColumnEncoding.Kind.DIRECT_V2); + } else { + result.setKind(OrcProto.ColumnEncoding.Kind.DIRECT); } - return OrcProto.ColumnEncoding.newBuilder() - .setKind(OrcProto.ColumnEncoding.Kind.DIRECT).build(); + return result; } @Override @@ -1338,25 +1341,23 @@ public void visit(StringRedBlackTree.VisitorContext context } @Override - OrcProto.ColumnEncoding getEncoding() { - // Returns the encoding used for the last call to writeStripe + OrcProto.ColumnEncoding.Builder getEncoding() { + OrcProto.ColumnEncoding.Builder result = super.getEncoding(); if (useDictionaryEncoding) { + result.setDictionarySize(dictionary.size()); if(isDirectV2) { - return OrcProto.ColumnEncoding.newBuilder().setKind( - OrcProto.ColumnEncoding.Kind.DICTIONARY_V2). - setDictionarySize(dictionary.size()).build(); + result.setKind(OrcProto.ColumnEncoding.Kind.DICTIONARY_V2); + } else { + result.setKind(OrcProto.ColumnEncoding.Kind.DICTIONARY); } - return OrcProto.ColumnEncoding.newBuilder().setKind( - OrcProto.ColumnEncoding.Kind.DICTIONARY). 
- setDictionarySize(dictionary.size()).build(); } else { if(isDirectV2) { - return OrcProto.ColumnEncoding.newBuilder().setKind( - OrcProto.ColumnEncoding.Kind.DIRECT_V2).build(); + result.setKind(OrcProto.ColumnEncoding.Kind.DIRECT_V2); + } else { + result.setKind(OrcProto.ColumnEncoding.Kind.DIRECT); } - return OrcProto.ColumnEncoding.newBuilder().setKind( - OrcProto.ColumnEncoding.Kind.DIRECT).build(); } + return result; } /** @@ -1673,13 +1674,14 @@ private static class BinaryTreeWriter extends TreeWriter { } @Override - OrcProto.ColumnEncoding getEncoding() { + OrcProto.ColumnEncoding.Builder getEncoding() { + OrcProto.ColumnEncoding.Builder result = super.getEncoding(); if (isDirectV2) { - return OrcProto.ColumnEncoding.newBuilder() - .setKind(OrcProto.ColumnEncoding.Kind.DIRECT_V2).build(); + result.setKind(OrcProto.ColumnEncoding.Kind.DIRECT_V2); + } else { + result.setKind(OrcProto.ColumnEncoding.Kind.DIRECT); } - return OrcProto.ColumnEncoding.newBuilder() - .setKind(OrcProto.ColumnEncoding.Kind.DIRECT).build(); + return result; } @Override @@ -1750,17 +1752,15 @@ long estimateMemory() { } } - public static long MILLIS_PER_DAY = 24 * 60 * 60 * 1000; - public static long NANOS_PER_MILLI = 1000000; public static final int MILLIS_PER_SECOND = 1000; - static final int NANOS_PER_SECOND = 1000000000; public static final String BASE_TIMESTAMP_STRING = "2015-01-01 00:00:00"; private static class TimestampTreeWriter extends TreeWriter { private final IntegerWriter seconds; private final IntegerWriter nanos; private final boolean isDirectV2; - private final long base_timestamp; + private final TimeZone localTimezone; + private final long baseEpochSecsLocalTz; TimestampTreeWriter(int columnId, TypeDescription schema, @@ -1775,19 +1775,21 @@ private static class TimestampTreeWriter extends TreeWriter { if (rowIndexPosition != null) { recordPosition(rowIndexPosition); } + this.localTimezone = TimeZone.getDefault(); // for unit tests to set different time zones - this.base_timestamp = Timestamp.valueOf(BASE_TIMESTAMP_STRING).getTime() / MILLIS_PER_SECOND; + this.baseEpochSecsLocalTz = Timestamp.valueOf(BASE_TIMESTAMP_STRING).getTime() / MILLIS_PER_SECOND; writer.useWriterTimeZone(true); } @Override - OrcProto.ColumnEncoding getEncoding() { + OrcProto.ColumnEncoding.Builder getEncoding() { + OrcProto.ColumnEncoding.Builder result = super.getEncoding(); if (isDirectV2) { - return OrcProto.ColumnEncoding.newBuilder() - .setKind(OrcProto.ColumnEncoding.Kind.DIRECT_V2).build(); + result.setKind(OrcProto.ColumnEncoding.Kind.DIRECT_V2); + } else { + result.setKind(OrcProto.ColumnEncoding.Kind.DIRECT); } - return OrcProto.ColumnEncoding.newBuilder() - .setKind(OrcProto.ColumnEncoding.Kind.DIRECT).build(); + return result; } @Override @@ -1800,14 +1802,15 @@ void writeBatch(ColumnVector vector, int offset, if (vector.noNulls || !vector.isNull[0]) { val = vec.asScratchTimestamp(0); long millis = val.getTime(); - indexStatistics.updateTimestamp(millis); + long utc = SerializationUtils.convertToUtc(localTimezone, millis); + indexStatistics.updateTimestamp(utc); if (createBloomFilter) { if (bloomFilter != null) { bloomFilter.addLong(millis); } - bloomFilterUtf8.addLong(millis); + bloomFilterUtf8.addLong(utc); } - final long secs = millis / MILLIS_PER_SECOND - base_timestamp; + final long secs = millis / MILLIS_PER_SECOND - baseEpochSecsLocalTz; final long nano = formatNanos(val.getNanos()); for(int i=0; i < length; ++i) { seconds.write(secs); @@ -1819,15 +1822,16 @@ void writeBatch(ColumnVector vector, int 
offset, if (vec.noNulls || !vec.isNull[i + offset]) { val = vec.asScratchTimestamp(i + offset); long millis = val.getTime(); - long secs = millis / MILLIS_PER_SECOND - base_timestamp; + long secs = millis / MILLIS_PER_SECOND - baseEpochSecsLocalTz; + long utc = SerializationUtils.convertToUtc(localTimezone, millis); seconds.write(secs); nanos.write(formatNanos(val.getNanos())); - indexStatistics.updateTimestamp(millis); + indexStatistics.updateTimestamp(utc); if (createBloomFilter) { if (bloomFilter != null) { bloomFilter.addLong(millis); } - bloomFilterUtf8.addLong(millis); + bloomFilterUtf8.addLong(utc); } } } @@ -1946,13 +1950,14 @@ void recordPosition(PositionRecorder recorder) throws IOException { } @Override - OrcProto.ColumnEncoding getEncoding() { + OrcProto.ColumnEncoding.Builder getEncoding() { + OrcProto.ColumnEncoding.Builder result = super.getEncoding(); if (isDirectV2) { - return OrcProto.ColumnEncoding.newBuilder() - .setKind(OrcProto.ColumnEncoding.Kind.DIRECT_V2).build(); + result.setKind(OrcProto.ColumnEncoding.Kind.DIRECT_V2); + } else { + result.setKind(OrcProto.ColumnEncoding.Kind.DIRECT); } - return OrcProto.ColumnEncoding.newBuilder() - .setKind(OrcProto.ColumnEncoding.Kind.DIRECT).build(); + return result; } @Override @@ -1988,13 +1993,14 @@ private static class DecimalTreeWriter extends TreeWriter { } @Override - OrcProto.ColumnEncoding getEncoding() { + OrcProto.ColumnEncoding.Builder getEncoding() { + OrcProto.ColumnEncoding.Builder result = super.getEncoding(); if (isDirectV2) { - return OrcProto.ColumnEncoding.newBuilder() - .setKind(OrcProto.ColumnEncoding.Kind.DIRECT_V2).build(); + result.setKind(OrcProto.ColumnEncoding.Kind.DIRECT_V2); + } else { + result.setKind(OrcProto.ColumnEncoding.Kind.DIRECT); } - return OrcProto.ColumnEncoding.newBuilder() - .setKind(OrcProto.ColumnEncoding.Kind.DIRECT).build(); + return result; } @Override @@ -2169,13 +2175,14 @@ private static class ListTreeWriter extends TreeWriter { } @Override - OrcProto.ColumnEncoding getEncoding() { + OrcProto.ColumnEncoding.Builder getEncoding() { + OrcProto.ColumnEncoding.Builder result = super.getEncoding(); if (isDirectV2) { - return OrcProto.ColumnEncoding.newBuilder() - .setKind(OrcProto.ColumnEncoding.Kind.DIRECT_V2).build(); + result.setKind(OrcProto.ColumnEncoding.Kind.DIRECT_V2); + } else { + result.setKind(OrcProto.ColumnEncoding.Kind.DIRECT); } - return OrcProto.ColumnEncoding.newBuilder() - .setKind(OrcProto.ColumnEncoding.Kind.DIRECT).build(); + return result; } @Override @@ -2282,13 +2289,14 @@ private static class MapTreeWriter extends TreeWriter { } @Override - OrcProto.ColumnEncoding getEncoding() { + OrcProto.ColumnEncoding.Builder getEncoding() { + OrcProto.ColumnEncoding.Builder result = super.getEncoding(); if (isDirectV2) { - return OrcProto.ColumnEncoding.newBuilder() - .setKind(OrcProto.ColumnEncoding.Kind.DIRECT_V2).build(); + result.setKind(OrcProto.ColumnEncoding.Kind.DIRECT_V2); + } else { + result.setKind(OrcProto.ColumnEncoding.Kind.DIRECT); } - return OrcProto.ColumnEncoding.newBuilder() - .setKind(OrcProto.ColumnEncoding.Kind.DIRECT).build(); + return result; } @Override diff --git a/java/core/src/java/org/apache/orc/util/BloomFilterIO.java b/java/core/src/java/org/apache/orc/util/BloomFilterIO.java index a6c3940ef2..74e9b0a908 100644 --- a/java/core/src/java/org/apache/orc/util/BloomFilterIO.java +++ b/java/core/src/java/org/apache/orc/util/BloomFilterIO.java @@ -28,6 +28,38 @@ public class BloomFilterIO { + public enum Encoding { + ORIGINAL(0), + UTF8_UTC(1), 
+ FUTURE(Integer.MAX_VALUE); + + public static final Encoding CURRENT = UTF8_UTC; + + private final int id; + + Encoding(int id) { + this.id = id; + } + + public int getId() { + return id; + } + + public static Encoding from(OrcProto.ColumnEncoding encoding) { + if (!encoding.hasBloomEncoding()) { + return ORIGINAL; + } + switch (encoding.getBloomEncoding()) { + case 0: + return ORIGINAL; + case 1: + return UTF8_UTC; + default: + return FUTURE; + } + } + } + private BloomFilterIO() { // never called } @@ -36,6 +68,7 @@ * Deserialize a bloom filter from the ORC file. */ public static BloomFilter deserialize(OrcProto.Stream.Kind kind, + OrcProto.ColumnEncoding encoding, OrcFile.WriterVersion fileVersion, TypeDescription.Category type, OrcProto.BloomFilter bloomFilter) { @@ -60,6 +93,13 @@ public static BloomFilter deserialize(OrcProto.Stream.Kind kind, return new BloomFilter(values, numFuncs); } case BLOOM_FILTER_UTF8: { + // make sure we don't use unknown encodings or original timestamp encodings + Encoding version = Encoding.from(encoding); + if (version == Encoding.FUTURE || + (type == TypeDescription.Category.TIMESTAMP && + version == Encoding.ORIGINAL)) { + return null; + } ByteString bits = bloomFilter.getUtf8Bitset(); long[] values = new long[bits.size() / 8]; bits.asReadOnlyByteBuffer().order(ByteOrder.LITTLE_ENDIAN)
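The round trip of the new bloomEncoding tag, as a sketch built only from calls introduced in this patch (the class name is invented): the writer stamps bloom-filtered columns with Encoding.CURRENT, while a reader maps a missing field to ORIGINAL and an unknown id to FUTURE, which deserialize() above treats as having no usable bloom filter.

import org.apache.orc.OrcProto;
import org.apache.orc.util.BloomFilterIO;

public class BloomEncodingSketch {
  public static void main(String[] args) {
    // What WriterImpl.getEncoding() now emits for a bloom-filtered column.
    OrcProto.ColumnEncoding enc = OrcProto.ColumnEncoding.newBuilder()
        .setKind(OrcProto.ColumnEncoding.Kind.DIRECT)
        .setBloomEncoding(BloomFilterIO.Encoding.CURRENT.getId())
        .build();
    System.out.println(BloomFilterIO.Encoding.from(enc));  // UTF8_UTC
  }
}

diff --git a/java/core/src/test/org/apache/orc/TestColumnStatistics.java b/java/core/src/test/org/apache/orc/TestColumnStatistics.java index b933bd684c..fd87b4ca34 100644 --- a/java/core/src/test/org/apache/orc/TestColumnStatistics.java +++ b/java/core/src/test/org/apache/orc/TestColumnStatistics.java @@ -25,7 +25,10 @@ import java.io.FileOutputStream; import java.io.PrintStream; import java.sql.Timestamp; +import java.text.ParseException; +import java.text.SimpleDateFormat; import java.util.List; +import java.util.TimeZone; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.FileSystem; @@ -141,9 +144,11 @@ public void testDateMerge() throws Exception { } @Test - public void testTimestampMerge() throws Exception { + public void testTimestampMergeUTC() throws Exception { TypeDescription schema = TypeDescription.createTimestamp(); + TimeZone original = TimeZone.getDefault(); + TimeZone.setDefault(TimeZone.getTimeZone("UTC")); ColumnStatisticsImpl stats1 = ColumnStatisticsImpl.create(schema); ColumnStatisticsImpl stats2 = ColumnStatisticsImpl.create(schema); stats1.updateTimestamp(new Timestamp(10)); @@ -160,6 +165,56 @@ public void testTimestampMerge() throws Exception { stats1.merge(stats2); assertEquals(-10, typed.getMinimum().getTime()); assertEquals(10000, typed.getMaximum().getTime()); + TimeZone.setDefault(original); + } + + private static final String TIME_FORMAT = "yyyy-MM-dd HH:mm:ss"; + private final SimpleDateFormat format = new SimpleDateFormat(TIME_FORMAT); + + private Timestamp parseTime(String value) { + try { + return new Timestamp(format.parse(value).getTime()); + } catch (ParseException e) { + throw new IllegalArgumentException("bad time parse for " + value, e); + } + } + + @Test + public void testTimestampMergeLA() throws Exception { + TypeDescription schema = TypeDescription.createTimestamp(); + + TimeZone original = TimeZone.getDefault(); + TimeZone.setDefault(TimeZone.getTimeZone("America/Los_Angeles")); + ColumnStatisticsImpl stats1 = ColumnStatisticsImpl.create(schema); + ColumnStatisticsImpl stats2 = ColumnStatisticsImpl.create(schema); + stats1.updateTimestamp(parseTime("2000-04-02 03:30:00")); +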
stats1.updateTimestamp(parseTime("2000-04-02 01:30:00")); + stats1.increment(2); + stats2.updateTimestamp(parseTime("2000-10-29 01:30:00")); + stats2.updateTimestamp(parseTime("2000-10-29 03:30:00")); + stats2.increment(2); + TimestampColumnStatistics typed = (TimestampColumnStatistics) stats1; + assertEquals("2000-04-02 01:30:00.0", typed.getMinimum().toString()); + assertEquals("2000-04-02 03:30:00.0", typed.getMaximum().toString()); + stats1.merge(stats2); + assertEquals("2000-04-02 01:30:00.0", typed.getMinimum().toString()); + assertEquals("2000-10-29 03:30:00.0", typed.getMaximum().toString()); + stats1.reset(); + stats1.updateTimestamp(parseTime("1999-04-04 00:00:00")); + stats1.updateTimestamp(parseTime("2009-03-08 12:00:00")); + stats1.merge(stats2); + assertEquals("1999-04-04 00:00:00.0", typed.getMinimum().toString()); + assertEquals("2009-03-08 12:00:00.0", typed.getMaximum().toString()); + + // serialize and read back in with phoenix timezone + OrcProto.ColumnStatistics serial = stats2.serialize().build(); + TimeZone.setDefault(TimeZone.getTimeZone("America/Phoenix")); + ColumnStatisticsImpl stats3 = ColumnStatisticsImpl.deserialize(serial); + assertEquals("2000-10-29 01:30:00.0", + ((TimestampColumnStatistics) stats3).getMinimum().toString()); + assertEquals("2000-10-29 03:30:00.0", + ((TimestampColumnStatistics) stats3).getMaximum().toString()); + TimeZone.setDefault(original); } @Test diff --git a/java/core/src/test/org/apache/orc/TestOrcTimezonePPD.java b/java/core/src/test/org/apache/orc/TestOrcTimezonePPD.java new file mode 100644 index 0000000000..c0b7d4487c --- /dev/null +++ b/java/core/src/test/org/apache/orc/TestOrcTimezonePPD.java @@ -0,0 +1,407 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * http://www.apache.org/licenses/LICENSE-2.0 + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.orc; + +import static junit.framework.Assert.assertEquals; + +import java.io.File; +import java.sql.Timestamp; +import java.text.DateFormat; +import java.text.SimpleDateFormat; +import java.util.Arrays; +import java.util.Collection; +import java.util.List; +import java.util.TimeZone; + +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.hive.ql.exec.vector.TimestampColumnVector; +import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch; +import org.apache.hadoop.hive.ql.io.sarg.PredicateLeaf; +import org.apache.hadoop.hive.ql.io.sarg.SearchArgument; +import org.apache.hadoop.hive.ql.io.sarg.SearchArgumentFactory; +import org.apache.hadoop.hive.ql.io.sarg.SearchArgumentImpl; +import org.apache.orc.impl.OrcIndex; +import org.apache.orc.impl.RecordReaderImpl; +import org.apache.orc.impl.SerializationUtils; +import org.apache.orc.util.BloomFilter; +import org.apache.orc.util.BloomFilterIO; +import org.junit.After; +import org.junit.Assert; +import org.junit.Before; +import org.junit.Rule; +import org.junit.Test; +import org.junit.rules.TestName; +import org.junit.runner.RunWith; +import org.junit.runners.Parameterized; + +import com.google.common.collect.Lists; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +/** + * + */ +@RunWith(Parameterized.class) +public class TestOrcTimezonePPD { + private static final Logger LOG = LoggerFactory.getLogger(TestOrcTimezonePPD.class); + + Path workDir = new Path(System.getProperty("test.tmp.dir", + "target" + File.separator + "test" + File.separator + "tmp")); + Configuration conf; + FileSystem fs; + Path testFilePath; + String writerTimeZone; + String readerTimeZone; + static TimeZone defaultTimeZone = TimeZone.getDefault(); + TimeZone utcTz = TimeZone.getTimeZone("UTC"); + DateFormat dateFormat = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss"); + + public TestOrcTimezonePPD(String writerTZ, String readerTZ) { + this.writerTimeZone = writerTZ; + this.readerTimeZone = readerTZ; + } + + @Parameterized.Parameters + public static Collection data() { + List result = Arrays.asList(new Object[][]{ + {"US/Eastern", "America/Los_Angeles"}, + {"US/Eastern", "UTC"}, + /* Extreme timezones */ + {"GMT-12:00", "GMT+14:00"}, + /* No difference in DST */ + {"America/Los_Angeles", "America/Los_Angeles"}, /* same timezone both with DST */ + {"Europe/Berlin", "Europe/Berlin"}, /* same as above but europe */ + {"America/Phoenix", "Asia/Kolkata"} /* Writer no DST, Reader no DST */, + {"Europe/Berlin", "America/Los_Angeles"} /* Writer DST, Reader DST */, + {"Europe/Berlin", "America/Chicago"} /* Writer DST, Reader DST */, + /* With DST difference */ + {"Europe/Berlin", "UTC"}, + {"UTC", "Europe/Berlin"} /* Writer no DST, Reader DST */, + {"America/Los_Angeles", "Asia/Kolkata"} /* Writer DST, Reader no DST */, + {"Europe/Berlin", "Asia/Kolkata"} /* Writer DST, Reader no DST */, + /* Timezone offsets for the reader has changed historically */ + {"Asia/Saigon", "Pacific/Enderbury"}, + {"UTC", "Asia/Jerusalem"}, + }); + return result; + } + + @Rule + public TestName testCaseName = new TestName(); + + @Before + public void openFileSystem() throws Exception { + conf = new Configuration(); + fs = FileSystem.getLocal(conf); + testFilePath = new Path(workDir, "TestOrcFile." 
+ + testCaseName.getMethodName() + ".orc"); + fs.delete(testFilePath, false); + } + + @After + public void restoreTimeZone() { + TimeZone.setDefault(defaultTimeZone); + } + + public static PredicateLeaf createPredicateLeaf(PredicateLeaf.Operator operator, + PredicateLeaf.Type type, + String columnName, + Object literal, + List literalList) { + return new SearchArgumentImpl.PredicateLeafImpl(operator, type, columnName, + literal, literalList); + } + + @Test + public void testTimestampPPDMinMax() throws Exception { + TypeDescription schema = TypeDescription.createTimestamp(); + + TimeZone.setDefault(TimeZone.getTimeZone(writerTimeZone)); + Writer writer = OrcFile.createWriter(testFilePath, + OrcFile.writerOptions(conf).setSchema(schema).stripeSize(100000) + .bufferSize(10000)); + assertEquals(writerTimeZone, TimeZone.getDefault().getID()); + List ts = Lists.newArrayList(); + ts.add("2007-08-01 00:00:00.0"); + ts.add("2007-08-01 04:00:00.0"); + VectorizedRowBatch batch = schema.createRowBatch(); + TimestampColumnVector times = (TimestampColumnVector) batch.cols[0]; + for (String t : ts) { + times.set(batch.size++, Timestamp.valueOf(t)); + } + writer.addRowBatch(batch); + writer.close(); + + TimeZone.setDefault(TimeZone.getTimeZone(readerTimeZone)); + Reader reader = OrcFile.createReader(testFilePath, + OrcFile.readerOptions(conf).filesystem(fs)); + assertEquals(readerTimeZone, TimeZone.getDefault().getID()); + RecordReader rows = reader.rows(); + batch = reader.getSchema().createRowBatch(); + times = (TimestampColumnVector) batch.cols[0]; + int idx = 0; + while (rows.nextBatch(batch)) { + for (int r = 0; r < batch.size; ++r) { + assertEquals(ts.get(idx++), times.asScratchTimestamp(r).toString()); + } + } + rows.close(); + ColumnStatistics[] colStats = reader.getStatistics(); + Timestamp gotMin = ((TimestampColumnStatistics) colStats[0]).getMinimum(); + assertEquals("2007-08-01 00:00:00.0", gotMin.toString()); + + Timestamp gotMax = ((TimestampColumnStatistics) colStats[0]).getMaximum(); + assertEquals("2007-08-01 04:00:00.0", gotMax.toString()); + + Assert.assertEquals(SearchArgument.TruthValue.YES_NO, RecordReaderImpl.evaluatePredicate(colStats[0], + SearchArgumentFactory.newBuilder().equals + ("c", PredicateLeaf.Type.TIMESTAMP, Timestamp.valueOf("2007-08-01 00:00:00.0")).build().getLeaves().get(0), + null)); + + Assert.assertEquals(SearchArgument.TruthValue.NO, RecordReaderImpl.evaluatePredicate(colStats[0], + SearchArgumentFactory.newBuilder().equals + ("c", PredicateLeaf.Type.TIMESTAMP, Timestamp.valueOf("2007-08-02 00:00:00.0")).build().getLeaves().get(0), + null)); + + Assert.assertEquals(SearchArgument.TruthValue.NO, RecordReaderImpl.evaluatePredicate(colStats[0], + SearchArgumentFactory.newBuilder().between + ("c", PredicateLeaf.Type.TIMESTAMP, Timestamp.valueOf("2007-08-01 05:00:00.0"), + Timestamp.valueOf("2007-08-01 06:00:00.0")).build().getLeaves().get(0), + null)); + + Assert.assertEquals(SearchArgument.TruthValue.YES_NO, RecordReaderImpl.evaluatePredicate(colStats[0], + SearchArgumentFactory.newBuilder().between + ("c", PredicateLeaf.Type.TIMESTAMP, Timestamp.valueOf("2007-08-01 00:00:00.0"), + Timestamp.valueOf("2007-08-01 03:00:00.0")).build().getLeaves().get(0), + null)); + + Assert.assertEquals(SearchArgument.TruthValue.YES_NO, RecordReaderImpl.evaluatePredicate(colStats[0], + SearchArgumentFactory.newBuilder().in + ("c", PredicateLeaf.Type.TIMESTAMP, Timestamp.valueOf("2007-08-01 00:00:00.0"), + Timestamp.valueOf("2007-08-01 03:00:00.0")).build().getLeaves().get(0), + 
null));
+
+    Assert.assertEquals(SearchArgument.TruthValue.NO, RecordReaderImpl.evaluatePredicate(colStats[0],
+        SearchArgumentFactory.newBuilder().in
+            ("c", PredicateLeaf.Type.TIMESTAMP, Timestamp.valueOf("2007-08-02 00:00:00.0"),
+                Timestamp.valueOf("2007-08-02 03:00:00.0")).build().getLeaves().get(0),
+        null));
+  }
+
+  static OrcProto.ColumnEncoding buildEncoding() {
+    OrcProto.ColumnEncoding.Builder result =
+        OrcProto.ColumnEncoding.newBuilder();
+    result.setKind(OrcProto.ColumnEncoding.Kind.DIRECT)
+        .setBloomEncoding(BloomFilterIO.Encoding.UTF8_UTC.getId());
+    return result.build();
+  }
+
+  @Test
+  public void testTimestampPPDBloomFilter() throws Exception {
+    LOG.info("Writer = " + writerTimeZone + " reader = " + readerTimeZone);
+    TypeDescription schema = TypeDescription.createStruct().addField("ts", TypeDescription.createTimestamp());
+
+    TimeZone.setDefault(TimeZone.getTimeZone(writerTimeZone));
+    Writer writer = OrcFile.createWriter(testFilePath,
+        OrcFile.writerOptions(conf).setSchema(schema).stripeSize(100000)
+            .bufferSize(10000).bloomFilterColumns("ts").writerVersion(OrcFile.WriterVersion.ORC_101));
+    assertEquals(writerTimeZone, TimeZone.getDefault().getID());
+    List<String> ts = Lists.newArrayList();
+    ts.add("2007-08-01 00:00:00.0");
+    ts.add("2007-08-01 04:00:00.0");
+    VectorizedRowBatch batch = schema.createRowBatch();
+    TimestampColumnVector times = (TimestampColumnVector) batch.cols[0];
+    for (String t : ts) {
+      times.set(batch.size++, Timestamp.valueOf(t));
+    }
+    writer.addRowBatch(batch);
+    writer.close();
+
+    TimeZone.setDefault(TimeZone.getTimeZone(readerTimeZone));
+    Reader reader = OrcFile.createReader(testFilePath,
+        OrcFile.readerOptions(conf).filesystem(fs));
+    assertEquals(readerTimeZone, TimeZone.getDefault().getID());
+    RecordReader rows = reader.rows();
+    batch = reader.getSchema().createRowBatch();
+    times = (TimestampColumnVector) batch.cols[0];
+    int idx = 0;
+    while (rows.nextBatch(batch)) {
+      for (int r = 0; r < batch.size; ++r) {
+        assertEquals(ts.get(idx++), times.asScratchTimestamp(r).toString());
+      }
+    }
+    boolean[] sargColumns = new boolean[2];
+    Arrays.fill(sargColumns, true);
+    OrcIndex indices = ((RecordReaderImpl) rows).readRowIndex(0, null, sargColumns);
+    rows.close();
+    ColumnStatistics[] colStats = reader.getStatistics();
+    Timestamp gotMin = ((TimestampColumnStatistics) colStats[1]).getMinimum();
+    assertEquals("2007-08-01 00:00:00.0", gotMin.toString());
+
+    Timestamp gotMax = ((TimestampColumnStatistics) colStats[1]).getMaximum();
+    assertEquals("2007-08-01 04:00:00.0", gotMax.toString());
+
+    OrcProto.BloomFilterIndex[] bloomFilterIndices = indices.getBloomFilterIndex();
+    OrcProto.BloomFilter bloomFilter = bloomFilterIndices[1].getBloomFilter(0);
+    BloomFilter bf = BloomFilterIO.deserialize(OrcProto.Stream.Kind.BLOOM_FILTER_UTF8,
+        buildEncoding(), reader.getWriterVersion(),
+        TypeDescription.Category.TIMESTAMP, bloomFilter);
+    Assert.assertEquals(SearchArgument.TruthValue.YES_NO, RecordReaderImpl.evaluatePredicate(colStats[1],
+        SearchArgumentFactory.newBuilder().equals
+            ("c", PredicateLeaf.Type.TIMESTAMP, Timestamp.valueOf("2007-08-01 00:00:00.0")).build().getLeaves().get(0),
+        bf));
+
+    Assert.assertEquals(SearchArgument.TruthValue.NO, RecordReaderImpl.evaluatePredicate(colStats[1],
+        SearchArgumentFactory.newBuilder().equals
+            ("c", PredicateLeaf.Type.TIMESTAMP, Timestamp.valueOf("2007-08-02 00:00:00.0")).build().getLeaves().get(0),
+        bf));
+
+    Assert.assertEquals(SearchArgument.TruthValue.YES_NO,
+        RecordReaderImpl.evaluatePredicate(colStats[1],
+            SearchArgumentFactory.newBuilder().in
+                ("c", PredicateLeaf.Type.TIMESTAMP, Timestamp.valueOf("2007-08-01 00:00:00.0"),
+                    Timestamp.valueOf("2007-08-01 03:00:00.0")).build().getLeaves().get(0),
+            bf));
+
+    Assert.assertEquals(SearchArgument.TruthValue.NO, RecordReaderImpl.evaluatePredicate(colStats[1],
+        SearchArgumentFactory.newBuilder().in
+            ("c", PredicateLeaf.Type.TIMESTAMP, Timestamp.valueOf("2007-08-02 00:00:00.0"),
+                Timestamp.valueOf("2007-08-02 03:00:00.0")).build().getLeaves().get(0),
+        bf));
+  }
+
+  @Test
+  public void testTimestampMinMaxAndBloomFilter() throws Exception {
+    TypeDescription schema = TypeDescription.createStruct().addField("ts", TypeDescription.createTimestamp());
+
+    TimeZone.setDefault(TimeZone.getTimeZone(writerTimeZone));
+    Writer writer = OrcFile.createWriter(testFilePath,
+        OrcFile.writerOptions(conf).setSchema(schema).stripeSize(100000)
+            .bufferSize(10000).bloomFilterColumns("ts"));
+    assertEquals(writerTimeZone, TimeZone.getDefault().getID());
+    List<String> ts = Lists.newArrayList();
+    ts.add("2007-08-01 00:00:00.0");
+    ts.add("2007-08-01 04:00:00.0");
+    VectorizedRowBatch batch = schema.createRowBatch();
+    TimestampColumnVector times = (TimestampColumnVector) batch.cols[0];
+    for (String t : ts) {
+      times.set(batch.size++, Timestamp.valueOf(t));
+    }
+    writer.addRowBatch(batch);
+    writer.close();
+
+    TimeZone.setDefault(TimeZone.getTimeZone(readerTimeZone));
+    Reader reader = OrcFile.createReader(testFilePath,
+        OrcFile.readerOptions(conf).filesystem(fs));
+    assertEquals(readerTimeZone, TimeZone.getDefault().getID());
+    RecordReader rows = reader.rows();
+    batch = reader.getSchema().createRowBatch();
+    times = (TimestampColumnVector) batch.cols[0];
+    int idx = 0;
+    while (rows.nextBatch(batch)) {
+      for (int r = 0; r < batch.size; ++r) {
+        assertEquals(ts.get(idx++), times.asScratchTimestamp(r).toString());
+      }
+    }
+    boolean[] sargColumns = new boolean[2];
+    Arrays.fill(sargColumns, true);
+    OrcIndex indices = ((RecordReaderImpl) rows).readRowIndex(0, null, sargColumns);
+    rows.close();
+    ColumnStatistics[] colStats = reader.getStatistics();
+    Timestamp gotMin = ((TimestampColumnStatistics) colStats[1]).getMinimum();
+    assertEquals("2007-08-01 00:00:00.0", gotMin.toString());
+
+    Timestamp gotMax = ((TimestampColumnStatistics) colStats[1]).getMaximum();
+    assertEquals("2007-08-01 04:00:00.0", gotMax.toString());
+
+    OrcProto.BloomFilterIndex[] bloomFilterIndices = indices.getBloomFilterIndex();
+    OrcProto.BloomFilter bloomFilter = bloomFilterIndices[1].getBloomFilter(0);
+    BloomFilter bf = BloomFilterIO.deserialize(OrcProto.Stream.Kind.BLOOM_FILTER_UTF8,
+        buildEncoding(), reader.getWriterVersion(),
+        TypeDescription.Category.TIMESTAMP, bloomFilter);
+    PredicateLeaf pred = createPredicateLeaf(
+        PredicateLeaf.Operator.NULL_SAFE_EQUALS, PredicateLeaf.Type.TIMESTAMP, "x",
+        Timestamp.valueOf("2007-08-01 00:00:00.0"), null);
+    Assert.assertEquals(SearchArgument.TruthValue.YES_NO, RecordReaderImpl.evaluatePredicate(colStats[1], pred, bf));
+
+    pred = createPredicateLeaf(PredicateLeaf.Operator.NULL_SAFE_EQUALS, PredicateLeaf.Type.TIMESTAMP, "x",
+        Timestamp.valueOf("2007-08-01 02:00:00.0"), null);
+    Assert.assertEquals(SearchArgument.TruthValue.NO, RecordReaderImpl.evaluatePredicate(colStats[1], pred, bf));
+
+    bf.addLong(SerializationUtils.convertToUtc(TimeZone.getDefault(),
+        Timestamp.valueOf("2007-08-01 02:00:00.0").getTime()));
+    Assert.assertEquals(SearchArgument.TruthValue.YES_NO,
+        RecordReaderImpl.evaluatePredicate(colStats[1], pred, bf));
+
+    pred = createPredicateLeaf(PredicateLeaf.Operator.LESS_THAN, PredicateLeaf.Type.TIMESTAMP, "x",
+        Timestamp.valueOf("2007-08-01 00:00:00.0"), null);
+    Assert.assertEquals(SearchArgument.TruthValue.NO, RecordReaderImpl.evaluatePredicate(colStats[1], pred, bf));
+
+    pred = createPredicateLeaf(PredicateLeaf.Operator.LESS_THAN_EQUALS, PredicateLeaf.Type.TIMESTAMP, "x",
+        Timestamp.valueOf("2007-08-01 00:00:00.0"), null);
+    Assert.assertEquals(SearchArgument.TruthValue.YES_NO, RecordReaderImpl.evaluatePredicate(colStats[1], pred, bf));
+
+    pred = createPredicateLeaf(PredicateLeaf.Operator.IS_NULL, PredicateLeaf.Type.TIMESTAMP, "x", null, null);
+    Assert.assertEquals(SearchArgument.TruthValue.NO, RecordReaderImpl.evaluatePredicate(colStats[1], pred, bf));
+  }
+
+  @Test
+  public void testTimestampAllNulls() throws Exception {
+    TypeDescription schema = TypeDescription.createStruct().addField("ts", TypeDescription.createTimestamp());
+
+    TimeZone.setDefault(TimeZone.getTimeZone(writerTimeZone));
+    Writer writer = OrcFile.createWriter(testFilePath,
+        OrcFile.writerOptions(conf).setSchema(schema).stripeSize(100000)
+            .bufferSize(10000).bloomFilterColumns("ts"));
+    assertEquals(writerTimeZone, TimeZone.getDefault().getID());
+    VectorizedRowBatch batch = schema.createRowBatch();
+    TimestampColumnVector times = (TimestampColumnVector) batch.cols[0];
+    for (int i = 0; i < 3; i++) {
+      times.set(batch.size++, null);
+    }
+    writer.addRowBatch(batch);
+    writer.close();
+
+    TimeZone.setDefault(TimeZone.getTimeZone(readerTimeZone));
+    Reader reader = OrcFile.createReader(testFilePath,
+        OrcFile.readerOptions(conf).filesystem(fs));
+    assertEquals(readerTimeZone, TimeZone.getDefault().getID());
+    RecordReader rows = reader.rows();
+    boolean[] sargColumns = new boolean[2];
+    Arrays.fill(sargColumns, true);
+    OrcIndex indices = ((RecordReaderImpl) rows).readRowIndex(0, null, sargColumns);
+    rows.close();
+    ColumnStatistics[] colStats = reader.getStatistics();
+    Timestamp gotMin = ((TimestampColumnStatistics) colStats[1]).getMinimum();
+    Assert.assertNull(gotMin);
+
+    Timestamp gotMax = ((TimestampColumnStatistics) colStats[1]).getMaximum();
+    Assert.assertNull(gotMax);
+
+    OrcProto.BloomFilterIndex[] bloomFilterIndices = indices.getBloomFilterIndex();
+    OrcProto.BloomFilter bloomFilter = bloomFilterIndices[1].getBloomFilter(0);
+    BloomFilter bf = BloomFilterIO.deserialize(OrcProto.Stream.Kind.BLOOM_FILTER_UTF8,
+        buildEncoding(), reader.getWriterVersion(),
+        TypeDescription.Category.TIMESTAMP, bloomFilter);
+    PredicateLeaf pred = createPredicateLeaf(
+        PredicateLeaf.Operator.NULL_SAFE_EQUALS, PredicateLeaf.Type.TIMESTAMP, "x",
+        Timestamp.valueOf("2007-08-01 00:00:00.0"), null);
+    Assert.assertEquals(SearchArgument.TruthValue.NULL, RecordReaderImpl.evaluatePredicate(colStats[1], pred, bf));
+
+    pred = createPredicateLeaf(PredicateLeaf.Operator.IS_NULL, PredicateLeaf.Type.TIMESTAMP, "x", null, null);
+    Assert.assertEquals(SearchArgument.TruthValue.YES, RecordReaderImpl.evaluatePredicate(colStats[1], pred, bf));
+  }
+}
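
The tests above all lean on one invariant: with the UTF8_UTC bloom filter encoding, timestamp entries are stored as UTC-normalized longs, so a reader whose default time zone differs from the writer's still probes the filter with an identical key. A minimal standalone sketch of that round trip, assuming the BloomFilterUtf8 and SerializationUtils.convertToUtc APIs the tests exercise (the class name, time zones, and testLong probe are illustrative, not part of the patch):

    import java.sql.Timestamp;
    import java.util.TimeZone;

    import org.apache.orc.impl.SerializationUtils;
    import org.apache.orc.util.BloomFilterUtf8;

    public class UtcBloomFilterSketch {
      // Parse a wall-clock string in the current default zone, then shift it
      // to UTC -- the same normalization applied on both write and read.
      static long utcKey(String wallClock) {
        long localMillis = Timestamp.valueOf(wallClock).getTime();
        return SerializationUtils.convertToUtc(TimeZone.getDefault(), localMillis);
      }

      public static void main(String[] args) {
        BloomFilterUtf8 bloom = new BloomFilterUtf8(10000, 0.01);

        // The "writer" runs in one zone...
        TimeZone.setDefault(TimeZone.getTimeZone("America/Los_Angeles"));
        bloom.addLong(utcKey("2007-08-01 00:00:00.0"));

        // ...the "reader" in another, yet membership still agrees.
        TimeZone.setDefault(TimeZone.getTimeZone("America/New_York"));
        System.out.println(bloom.testLong(utcKey("2007-08-01 00:00:00.0"))); // true
        System.out.println(bloom.testLong(utcKey("2007-08-02 00:00:00.0"))); // false, modulo FPP
      }
    }

This is why testTimestampPPDBloomFilter can assert YES_NO for the written value and NO for an absent one even though writerTimeZone and readerTimeZone differ.
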
diff --git a/java/core/src/test/org/apache/orc/impl/TestRecordReaderImpl.java b/java/core/src/test/org/apache/orc/impl/TestRecordReaderImpl.java
index 6b4daa8b24..354cb89e08 100644
--- a/java/core/src/test/org/apache/orc/impl/TestRecordReaderImpl.java
+++ b/java/core/src/test/org/apache/orc/impl/TestRecordReaderImpl.java
@@ -33,10 +33,15 @@
 import java.io.File;
 import java.io.IOException;
 import java.io.InputStream;
+import java.sql.Date;
 import java.sql.Timestamp;
+import java.text.DateFormat;
+import java.text.ParseException;
+import java.text.SimpleDateFormat;
 import java.util.ArrayList;
 import java.util.Arrays;
 import java.util.List;
+import java.util.TimeZone;
 
 import org.apache.hadoop.conf.Configuration;
 import org.apache.hadoop.fs.FSDataInputStream;
@@ -64,6 +69,8 @@
 import org.apache.orc.Reader;
 import org.apache.orc.OrcProto;
+import org.apache.orc.util.BloomFilterIO;
+import org.apache.orc.util.BloomFilterUtf8;
 
 import org.junit.Assert;
 import org.junit.Test;
 import org.mockito.MockSettings;
@@ -343,10 +350,12 @@ private static OrcProto.ColumnStatistics createDateStats(int min, int max) {
     return OrcProto.ColumnStatistics.newBuilder().setDateStatistics(dateStats.build()).build();
   }
 
-  private static OrcProto.ColumnStatistics createTimestampStats(long min, long max) {
+  private static final TimeZone utcTz = TimeZone.getTimeZone("UTC");
+
+  private static OrcProto.ColumnStatistics createTimestampStats(String min, String max) {
     OrcProto.TimestampStatistics.Builder tsStats = OrcProto.TimestampStatistics.newBuilder();
-    tsStats.setMinimum(min);
-    tsStats.setMaximum(max);
+    tsStats.setMinimumUtc(getUtcTimestamp(min));
+    tsStats.setMaximumUtc(getUtcTimestamp(max));
     return OrcProto.ColumnStatistics.newBuilder().setTimestampStatistics(tsStats.build()).build();
   }
 
@@ -387,28 +396,96 @@ public void testGetMax() throws Exception {
         .deserialize(createDecimalStats("111.1", "112.1"))));
   }
 
+  static TruthValue evaluateBoolean(OrcProto.ColumnStatistics stats,
+                                    PredicateLeaf predicate) {
+    OrcProto.ColumnEncoding encoding =
+        OrcProto.ColumnEncoding.newBuilder()
+            .setKind(OrcProto.ColumnEncoding.Kind.DIRECT)
+            .build();
+    return RecordReaderImpl.evaluatePredicateProto(stats, predicate, null,
+        encoding, null,
+        OrcFile.WriterVersion.ORC_135, TypeDescription.Category.BOOLEAN);
+  }
+
+  static TruthValue evaluateInteger(OrcProto.ColumnStatistics stats,
+                                    PredicateLeaf predicate) {
+    OrcProto.ColumnEncoding encoding =
+        OrcProto.ColumnEncoding.newBuilder()
+            .setKind(OrcProto.ColumnEncoding.Kind.DIRECT_V2)
+            .build();
+    return RecordReaderImpl.evaluatePredicateProto(stats, predicate, null,
+        encoding, null,
+        OrcFile.WriterVersion.ORC_135, TypeDescription.Category.LONG);
+  }
+
+  static TruthValue evaluateDouble(OrcProto.ColumnStatistics stats,
+                                   PredicateLeaf predicate) {
+    OrcProto.ColumnEncoding encoding =
+        OrcProto.ColumnEncoding.newBuilder()
+            .setKind(OrcProto.ColumnEncoding.Kind.DIRECT)
+            .build();
+    return RecordReaderImpl.evaluatePredicateProto(stats, predicate, null,
+        encoding, null,
+        OrcFile.WriterVersion.ORC_135, TypeDescription.Category.DOUBLE);
+  }
+
+  static TruthValue evaluateTimestamp(OrcProto.ColumnStatistics stats,
+                                      PredicateLeaf predicate,
+                                      boolean include135) {
+    OrcProto.ColumnEncoding encoding =
+        OrcProto.ColumnEncoding.newBuilder()
+            .setKind(OrcProto.ColumnEncoding.Kind.DIRECT)
+            .build();
+    return RecordReaderImpl.evaluatePredicateProto(stats, predicate, null,
+        encoding, null,
+        include135 ? OrcFile.WriterVersion.ORC_135 : OrcFile.WriterVersion.ORC_101,
+        TypeDescription.Category.TIMESTAMP);
+  }
+
+  static TruthValue evaluateTimestampBloomfilter(OrcProto.ColumnStatistics stats,
+                                                 PredicateLeaf predicate,
+                                                 BloomFilter bloom,
+                                                 OrcFile.WriterVersion version) {
+    OrcProto.ColumnEncoding.Builder encoding =
+        OrcProto.ColumnEncoding.newBuilder()
+            .setKind(OrcProto.ColumnEncoding.Kind.DIRECT);
+    if (version.includes(OrcFile.WriterVersion.ORC_135)) {
+      encoding.setBloomEncoding(BloomFilterIO.Encoding.UTF8_UTC.getId());
+    }
+    OrcProto.Stream.Kind kind =
+        version.includes(OrcFile.WriterVersion.ORC_101) ?
+            OrcProto.Stream.Kind.BLOOM_FILTER_UTF8 :
+            OrcProto.Stream.Kind.BLOOM_FILTER;
+    OrcProto.BloomFilter.Builder builder =
+        OrcProto.BloomFilter.newBuilder();
+    BloomFilterIO.serialize(builder, bloom);
+    return RecordReaderImpl.evaluatePredicateProto(stats, predicate, kind,
+        encoding.build(), builder.build(), version,
+        TypeDescription.Category.TIMESTAMP);
+  }
+
   @Test
   public void testPredEvalWithBooleanStats() throws Exception {
     PredicateLeaf pred = createPredicateLeaf(
         PredicateLeaf.Operator.NULL_SAFE_EQUALS, PredicateLeaf.Type.BOOLEAN, "x", true, null);
     assertEquals(TruthValue.YES_NO,
-        RecordReaderImpl.evaluatePredicateProto(createBooleanStats(10, 10), pred, null, null, OrcFile.WriterVersion.ORC_101, TypeDescription.Category.BOOLEAN));
+        evaluateBoolean(createBooleanStats(10, 10), pred));
     assertEquals(TruthValue.NO,
-        RecordReaderImpl.evaluatePredicateProto(createBooleanStats(10, 0), pred, null, null, OrcFile.WriterVersion.ORC_101, TypeDescription.Category.BOOLEAN));
+        evaluateBoolean(createBooleanStats(10, 0), pred));
 
     pred = createPredicateLeaf(
         PredicateLeaf.Operator.NULL_SAFE_EQUALS, PredicateLeaf.Type.BOOLEAN, "x", true, null);
     assertEquals(TruthValue.YES_NO,
-        RecordReaderImpl.evaluatePredicateProto(createBooleanStats(10, 10), pred, null, null, OrcFile.WriterVersion.ORC_101, TypeDescription.Category.BOOLEAN));
+        evaluateBoolean(createBooleanStats(10, 10), pred));
     assertEquals(TruthValue.NO,
-        RecordReaderImpl.evaluatePredicateProto(createBooleanStats(10, 0), pred, null, null, OrcFile.WriterVersion.ORC_101, TypeDescription.Category.BOOLEAN));
+        evaluateBoolean(createBooleanStats(10, 0), pred));
 
     pred = createPredicateLeaf(
         PredicateLeaf.Operator.NULL_SAFE_EQUALS, PredicateLeaf.Type.BOOLEAN, "x", false, null);
     assertEquals(TruthValue.NO,
-        RecordReaderImpl.evaluatePredicateProto(createBooleanStats(10, 10), pred, null, null, OrcFile.WriterVersion.ORC_101, TypeDescription.Category.BOOLEAN));
+        evaluateBoolean(createBooleanStats(10, 10), pred));
     assertEquals(TruthValue.YES_NO,
-        RecordReaderImpl.evaluatePredicateProto(createBooleanStats(10, 0), pred, null, null, OrcFile.WriterVersion.ORC_101, TypeDescription.Category.BOOLEAN));
+        evaluateBoolean(createBooleanStats(10, 0), pred));
   }
 
   @Test
@@ -416,34 +493,34 @@ public void testPredEvalWithIntStats() throws Exception {
     PredicateLeaf pred = createPredicateLeaf(
         PredicateLeaf.Operator.NULL_SAFE_EQUALS, PredicateLeaf.Type.LONG, "x", 15L, null);
     assertEquals(TruthValue.YES_NO,
-        RecordReaderImpl.evaluatePredicateProto(createIntStats(10, 100), pred, null, null, OrcFile.WriterVersion.ORC_101, TypeDescription.Category.LONG));
+        evaluateInteger(createIntStats(10, 100), pred));
 
     pred = createPredicateLeaf(PredicateLeaf.Operator.NULL_SAFE_EQUALS,
         PredicateLeaf.Type.FLOAT, "x", 15.0, null);
     assertEquals(TruthValue.YES_NO,
-        RecordReaderImpl.evaluatePredicateProto(createIntStats(10, 100), pred, null, null, OrcFile.WriterVersion.ORC_101, TypeDescription.Category.LONG));
+        evaluateInteger(createIntStats(10, 100), pred));
 
     // Stats gets converted to column type. "15" is outside of "10" and "100"
     pred = createPredicateLeaf(PredicateLeaf.Operator.NULL_SAFE_EQUALS,
         PredicateLeaf.Type.STRING, "x", "15", null);
     assertEquals(TruthValue.NO,
-        RecordReaderImpl.evaluatePredicateProto(createIntStats(10, 100), pred, null, null, OrcFile.WriterVersion.ORC_101, TypeDescription.Category.LONG));
+        evaluateInteger(createIntStats(10, 100), pred));
 
     // Integer stats will not be converted to date because of days/seconds/millis ambiguity
     pred = createPredicateLeaf(PredicateLeaf.Operator.NULL_SAFE_EQUALS,
         PredicateLeaf.Type.DATE, "x", new DateWritable(15).get(), null);
     assertEquals(TruthValue.YES_NO,
-        RecordReaderImpl.evaluatePredicateProto(createIntStats(10, 100), pred, null, null, OrcFile.WriterVersion.ORC_101, TypeDescription.Category.LONG));
+        evaluateInteger(createIntStats(10, 100), pred));
 
     pred = createPredicateLeaf(PredicateLeaf.Operator.NULL_SAFE_EQUALS,
         PredicateLeaf.Type.DECIMAL, "x", new HiveDecimalWritable("15"), null);
     assertEquals(TruthValue.YES_NO,
-        RecordReaderImpl.evaluatePredicateProto(createIntStats(10, 100), pred, null, null, OrcFile.WriterVersion.ORC_101, TypeDescription.Category.LONG));
+        evaluateInteger(createIntStats(10, 100), pred));
 
     pred = createPredicateLeaf(PredicateLeaf.Operator.NULL_SAFE_EQUALS,
         PredicateLeaf.Type.TIMESTAMP, "x", new Timestamp(15), null);
     assertEquals(TruthValue.YES_NO,
-        RecordReaderImpl.evaluatePredicateProto(createIntStats(10, 100), pred, null, null, OrcFile.WriterVersion.ORC_101, TypeDescription.Category.LONG));
+        evaluateInteger(createIntStats(10, 100), pred));
   }
 
   @Test
@@ -451,39 +528,39 @@ public void testPredEvalWithDoubleStats() throws Exception {
     PredicateLeaf pred = createPredicateLeaf(
         PredicateLeaf.Operator.NULL_SAFE_EQUALS, PredicateLeaf.Type.LONG, "x", 15L, null);
     assertEquals(TruthValue.YES_NO,
-        RecordReaderImpl.evaluatePredicateProto(createDoubleStats(10.0, 100.0), pred, null, null, OrcFile.WriterVersion.ORC_101, TypeDescription.Category.LONG));
+        evaluateDouble(createDoubleStats(10.0, 100.0), pred));
 
     pred = createPredicateLeaf(PredicateLeaf.Operator.NULL_SAFE_EQUALS,
         PredicateLeaf.Type.FLOAT, "x", 15.0, null);
     assertEquals(TruthValue.YES_NO,
-        RecordReaderImpl.evaluatePredicateProto(createDoubleStats(10.0, 100.0), pred, null, null, OrcFile.WriterVersion.ORC_101, TypeDescription.Category.LONG));
+        evaluateDouble(createDoubleStats(10.0, 100.0), pred));
 
     // Stats gets converted to column type. "15.0" is outside of "10.0" and "100.0"
     pred = createPredicateLeaf(PredicateLeaf.Operator.NULL_SAFE_EQUALS,
         PredicateLeaf.Type.STRING, "x", "15", null);
     assertEquals(TruthValue.NO,
-        RecordReaderImpl.evaluatePredicateProto(createDoubleStats(10.0, 100.0), pred, null, null, OrcFile.WriterVersion.ORC_101, TypeDescription.Category.DOUBLE));
+        evaluateDouble(createDoubleStats(10.0, 100.0), pred));
 
     // Double is not converted to date type because of days/seconds/millis ambiguity
     pred = createPredicateLeaf(PredicateLeaf.Operator.NULL_SAFE_EQUALS,
         PredicateLeaf.Type.DATE, "x", new DateWritable(15).get(), null);
     assertEquals(TruthValue.YES_NO,
-        RecordReaderImpl.evaluatePredicateProto(createDoubleStats(10.0, 100.0), pred, null, null, OrcFile.WriterVersion.ORC_101, TypeDescription.Category.DOUBLE));
+        evaluateDouble(createDoubleStats(10.0, 100.0), pred));
 
     pred = createPredicateLeaf(PredicateLeaf.Operator.NULL_SAFE_EQUALS,
         PredicateLeaf.Type.DECIMAL, "x", new HiveDecimalWritable("15"), null);
     assertEquals(TruthValue.YES_NO,
-        RecordReaderImpl.evaluatePredicateProto(createDoubleStats(10.0, 100.0), pred, null, null, OrcFile.WriterVersion.ORC_101, TypeDescription.Category.DOUBLE));
+        evaluateDouble(createDoubleStats(10.0, 100.0), pred));
 
     pred = createPredicateLeaf(PredicateLeaf.Operator.NULL_SAFE_EQUALS,
         PredicateLeaf.Type.TIMESTAMP, "x", new Timestamp(15*1000L), null);
     assertEquals(TruthValue.YES_NO,
-        RecordReaderImpl.evaluatePredicateProto(createDoubleStats(10.0, 100.0), pred, null, null, OrcFile.WriterVersion.ORC_101, TypeDescription.Category.DOUBLE));
+        evaluateDouble(createDoubleStats(10.0, 100.0), pred));
 
     pred = createPredicateLeaf(PredicateLeaf.Operator.NULL_SAFE_EQUALS,
         PredicateLeaf.Type.TIMESTAMP, "x", new Timestamp(150*1000L), null);
     assertEquals(TruthValue.NO,
-        RecordReaderImpl.evaluatePredicateProto(createDoubleStats(10.0, 100.0), pred, null, null, OrcFile.WriterVersion.ORC_101, TypeDescription.Category.DOUBLE));
+        evaluateDouble(createDoubleStats(10.0, 100.0), pred));
   }
 
   @Test
@@ -491,33 +568,33 @@ public void testPredEvalWithStringStats() throws Exception {
     PredicateLeaf pred = createPredicateLeaf(
         PredicateLeaf.Operator.NULL_SAFE_EQUALS, PredicateLeaf.Type.LONG, "x", 100L, null);
     assertEquals(TruthValue.YES_NO,
-        RecordReaderImpl.evaluatePredicateProto(createStringStats("10", "1000"), pred, null, null, OrcFile.WriterVersion.ORC_101, TypeDescription.Category.LONG));
+        evaluateInteger(createStringStats("10", "1000"), pred));
 
     pred = createPredicateLeaf(PredicateLeaf.Operator.NULL_SAFE_EQUALS,
         PredicateLeaf.Type.FLOAT, "x", 100.0, null);
     assertEquals(TruthValue.YES_NO,
-        RecordReaderImpl.evaluatePredicateProto(createStringStats("10", "1000"), pred, null, null, OrcFile.WriterVersion.ORC_101, TypeDescription.Category.LONG));
+        evaluateInteger(createStringStats("10", "1000"), pred));
 
     pred = createPredicateLeaf(PredicateLeaf.Operator.NULL_SAFE_EQUALS,
         PredicateLeaf.Type.STRING, "x", "100", null);
     assertEquals(TruthValue.YES_NO,
-        RecordReaderImpl.evaluatePredicateProto(createStringStats("10", "1000"), pred, null, null, OrcFile.WriterVersion.ORC_101, TypeDescription.Category.LONG));
+        evaluateInteger(createStringStats("10", "1000"), pred));
 
     // IllegalArgumentException is thrown when converting String to Date, hence YES_NO
     pred = createPredicateLeaf(PredicateLeaf.Operator.NULL_SAFE_EQUALS,
         PredicateLeaf.Type.DATE, "x", new DateWritable(100).get(), null);
     assertEquals(TruthValue.YES_NO,
-        RecordReaderImpl.evaluatePredicateProto(createDateStats(10, 1000), pred, null, null, OrcFile.WriterVersion.ORC_101, TypeDescription.Category.LONG));
+        evaluateInteger(createDateStats(10, 1000), pred));
 
     pred = createPredicateLeaf(PredicateLeaf.Operator.NULL_SAFE_EQUALS,
         PredicateLeaf.Type.DECIMAL, "x", new HiveDecimalWritable("100"), null);
     assertEquals(TruthValue.YES_NO,
-        RecordReaderImpl.evaluatePredicateProto(createStringStats("10", "1000"), pred, null, null, OrcFile.WriterVersion.ORC_101, TypeDescription.Category.LONG));
+        evaluateInteger(createStringStats("10", "1000"), pred));
 
     pred = createPredicateLeaf(PredicateLeaf.Operator.NULL_SAFE_EQUALS,
         PredicateLeaf.Type.TIMESTAMP, "x", new Timestamp(100), null);
     assertEquals(TruthValue.YES_NO,
-        RecordReaderImpl.evaluatePredicateProto(createStringStats("10", "1000"), pred, null, null, OrcFile.WriterVersion.ORC_101, TypeDescription.Category.LONG));
+        evaluateInteger(createStringStats("10", "1000"), pred));
   }
 
   @Test
@@ -526,69 +603,69 @@ public void testPredEvalWithDateStats() throws Exception {
         PredicateLeaf.Operator.NULL_SAFE_EQUALS, PredicateLeaf.Type.LONG, "x", 15L, null);
     // Date to Integer conversion is not possible.
     assertEquals(TruthValue.YES_NO,
-        RecordReaderImpl.evaluatePredicateProto(createDateStats(10, 100), pred, null, null, OrcFile.WriterVersion.ORC_101, TypeDescription.Category.LONG));
+        evaluateInteger(createDateStats(10, 100), pred));
 
     // Date to Float conversion is also not possible.
     pred = createPredicateLeaf(PredicateLeaf.Operator.NULL_SAFE_EQUALS,
         PredicateLeaf.Type.FLOAT, "x", 15.0, null);
     assertEquals(TruthValue.YES_NO,
-        RecordReaderImpl.evaluatePredicateProto(createDateStats(10, 100), pred, null, null, OrcFile.WriterVersion.ORC_101, TypeDescription.Category.LONG));
+        evaluateInteger(createDateStats(10, 100), pred));
 
     pred = createPredicateLeaf(PredicateLeaf.Operator.NULL_SAFE_EQUALS,
         PredicateLeaf.Type.STRING, "x", "15", null);
     assertEquals(TruthValue.NO,
-        RecordReaderImpl.evaluatePredicateProto(createDateStats(10, 100), pred, null, null, OrcFile.WriterVersion.ORC_101, TypeDescription.Category.LONG));
+        evaluateInteger(createDateStats(10, 100), pred));
 
     pred = createPredicateLeaf(PredicateLeaf.Operator.NULL_SAFE_EQUALS,
         PredicateLeaf.Type.STRING, "x", "1970-01-11", null);
     assertEquals(TruthValue.YES_NO,
-        RecordReaderImpl.evaluatePredicateProto(createDateStats(10, 100), pred, null, null, OrcFile.WriterVersion.ORC_101, TypeDescription.Category.LONG));
+        evaluateInteger(createDateStats(10, 100), pred));
 
     pred = createPredicateLeaf(PredicateLeaf.Operator.NULL_SAFE_EQUALS,
         PredicateLeaf.Type.STRING, "x", "15.1", null);
     assertEquals(TruthValue.NO,
-        RecordReaderImpl.evaluatePredicateProto(createDateStats(10, 100), pred, null, null, OrcFile.WriterVersion.ORC_101, TypeDescription.Category.LONG));
+        evaluateInteger(createDateStats(10, 100), pred));
 
     pred = createPredicateLeaf(PredicateLeaf.Operator.NULL_SAFE_EQUALS,
         PredicateLeaf.Type.STRING, "x", "__a15__1", null);
     assertEquals(TruthValue.NO,
-        RecordReaderImpl.evaluatePredicateProto(createDateStats(10, 100), pred, null, null, OrcFile.WriterVersion.ORC_101, TypeDescription.Category.LONG));
+        evaluateInteger(createDateStats(10, 100), pred));
 
     pred = createPredicateLeaf(PredicateLeaf.Operator.NULL_SAFE_EQUALS,
         PredicateLeaf.Type.STRING, "x", "2000-01-16", null);
     assertEquals(TruthValue.NO,
-        RecordReaderImpl.evaluatePredicateProto(createDateStats(10, 100), pred, null, null, OrcFile.WriterVersion.ORC_101, TypeDescription.Category.LONG));
+        evaluateInteger(createDateStats(10, 100), pred));
 
     pred = createPredicateLeaf(PredicateLeaf.Operator.NULL_SAFE_EQUALS,
         PredicateLeaf.Type.STRING, "x", "1970-01-16", null);
     assertEquals(TruthValue.YES_NO,
-        RecordReaderImpl.evaluatePredicateProto(createDateStats(10, 100), pred, null, null, OrcFile.WriterVersion.ORC_101, TypeDescription.Category.LONG));
+        evaluateInteger(createDateStats(10, 100), pred));
 
     pred = createPredicateLeaf(PredicateLeaf.Operator.NULL_SAFE_EQUALS,
         PredicateLeaf.Type.DATE, "x", new DateWritable(15).get(), null);
     assertEquals(TruthValue.YES_NO,
-        RecordReaderImpl.evaluatePredicateProto(createDateStats(10, 100), pred, null, null, OrcFile.WriterVersion.ORC_101, TypeDescription.Category.LONG));
+        evaluateInteger(createDateStats(10, 100), pred));
 
     pred = createPredicateLeaf(PredicateLeaf.Operator.NULL_SAFE_EQUALS,
         PredicateLeaf.Type.DATE, "x", new DateWritable(150).get(), null);
     assertEquals(TruthValue.NO,
-        RecordReaderImpl.evaluatePredicateProto(createDateStats(10, 100), pred, null, null, OrcFile.WriterVersion.ORC_101, TypeDescription.Category.LONG));
+        evaluateInteger(createDateStats(10, 100), pred));
 
     // Date to Decimal conversion is also not possible.
     pred = createPredicateLeaf(PredicateLeaf.Operator.NULL_SAFE_EQUALS,
         PredicateLeaf.Type.DECIMAL, "x", new HiveDecimalWritable("15"), null);
     assertEquals(TruthValue.YES_NO,
-        RecordReaderImpl.evaluatePredicateProto(createDateStats(10, 100), pred, null, null, OrcFile.WriterVersion.ORC_101, TypeDescription.Category.LONG));
+        evaluateInteger(createDateStats(10, 100), pred));
 
     pred = createPredicateLeaf(PredicateLeaf.Operator.NULL_SAFE_EQUALS,
         PredicateLeaf.Type.TIMESTAMP, "x", new Timestamp(15), null);
     assertEquals(TruthValue.NO,
-        RecordReaderImpl.evaluatePredicateProto(createDateStats(10, 100), pred, null, null, OrcFile.WriterVersion.ORC_101, TypeDescription.Category.LONG));
+        evaluateInteger(createDateStats(10, 100), pred));
 
     pred = createPredicateLeaf(PredicateLeaf.Operator.NULL_SAFE_EQUALS,
         PredicateLeaf.Type.TIMESTAMP, "x",
         new Timestamp(15L * 24L * 60L * 60L * 1000L), null);
     assertEquals(TruthValue.YES_NO,
-        RecordReaderImpl.evaluatePredicateProto(createDateStats(10, 100), pred, null, null, OrcFile.WriterVersion.ORC_101, TypeDescription.Category.LONG));
+        evaluateInteger(createDateStats(10, 100), pred));
   }
 
   @Test
@@ -596,86 +673,86 @@ public void testPredEvalWithDecimalStats() throws Exception {
     PredicateLeaf pred = createPredicateLeaf(
         PredicateLeaf.Operator.NULL_SAFE_EQUALS, PredicateLeaf.Type.LONG, "x", 15L, null);
     assertEquals(TruthValue.YES_NO,
-        RecordReaderImpl.evaluatePredicateProto(createDecimalStats("10.0", "100.0"), pred, null, null, OrcFile.WriterVersion.ORC_101, TypeDescription.Category.LONG));
+        evaluateInteger(createDecimalStats("10.0", "100.0"), pred));
 
     pred = createPredicateLeaf(PredicateLeaf.Operator.NULL_SAFE_EQUALS,
         PredicateLeaf.Type.FLOAT, "x", 15.0, null);
     assertEquals(TruthValue.YES_NO,
-        RecordReaderImpl.evaluatePredicateProto(createDecimalStats("10.0", "100.0"), pred, null, null, OrcFile.WriterVersion.ORC_101, TypeDescription.Category.LONG));
+        evaluateInteger(createDecimalStats("10.0", "100.0"), pred));
 
     // "15" out of range of "10.0" and "100.0"
     pred = createPredicateLeaf(PredicateLeaf.Operator.NULL_SAFE_EQUALS,
         PredicateLeaf.Type.STRING, "x", "15", null);
     assertEquals(TruthValue.NO,
-        RecordReaderImpl.evaluatePredicateProto(createDecimalStats("10.0", "100.0"), pred, null, null, OrcFile.WriterVersion.ORC_101, TypeDescription.Category.LONG));
+        evaluateInteger(createDecimalStats("10.0", "100.0"), pred));
 
     // Decimal to Date not possible.
     pred = createPredicateLeaf(PredicateLeaf.Operator.NULL_SAFE_EQUALS,
         PredicateLeaf.Type.DATE, "x", new DateWritable(15).get(), null);
     assertEquals(TruthValue.YES_NO,
-        RecordReaderImpl.evaluatePredicateProto(createDecimalStats("10.0", "100.0"), pred, null, null, OrcFile.WriterVersion.ORC_101, TypeDescription.Category.LONG));
+        evaluateInteger(createDecimalStats("10.0", "100.0"), pred));
 
     pred = createPredicateLeaf(PredicateLeaf.Operator.NULL_SAFE_EQUALS,
         PredicateLeaf.Type.DECIMAL, "x", new HiveDecimalWritable("15"), null);
     assertEquals(TruthValue.YES_NO,
-        RecordReaderImpl.evaluatePredicateProto(createDecimalStats("10.0", "100.0"), pred, null, null, OrcFile.WriterVersion.ORC_101, TypeDescription.Category.LONG));
+        evaluateInteger(createDecimalStats("10.0", "100.0"), pred));
 
     pred = createPredicateLeaf(PredicateLeaf.Operator.NULL_SAFE_EQUALS,
         PredicateLeaf.Type.TIMESTAMP, "x", new Timestamp(15 * 1000L), null);
     assertEquals(TruthValue.YES_NO,
-        RecordReaderImpl.evaluatePredicateProto(createDecimalStats("10.0", "100.0"), pred, null, null, OrcFile.WriterVersion.ORC_101, TypeDescription.Category.LONG));
+        evaluateInteger(createDecimalStats("10.0", "100.0"), pred));
 
     pred = createPredicateLeaf(PredicateLeaf.Operator.NULL_SAFE_EQUALS,
         PredicateLeaf.Type.TIMESTAMP, "x", new Timestamp(150 * 1000L), null);
     assertEquals(TruthValue.NO,
-        RecordReaderImpl.evaluatePredicateProto(createDecimalStats("10.0", "100.0"), pred, null, null, OrcFile.WriterVersion.ORC_101, TypeDescription.Category.LONG));
+        evaluateInteger(createDecimalStats("10.0", "100.0"), pred));
   }
 
   @Test
   public void testPredEvalWithTimestampStats() throws Exception {
     PredicateLeaf pred = createPredicateLeaf(
-        PredicateLeaf.Operator.NULL_SAFE_EQUALS, PredicateLeaf.Type.LONG, "x", 15L, null);
+        PredicateLeaf.Operator.NULL_SAFE_EQUALS, PredicateLeaf.Type.TIMESTAMP,
+        "x", Timestamp.valueOf("2017-01-01 00:00:00"), null);
     assertEquals(TruthValue.YES_NO,
-        RecordReaderImpl.evaluatePredicateProto(createTimestampStats(10, 100), pred, null, null, OrcFile.WriterVersion.ORC_101, TypeDescription.Category.LONG));
+        evaluateTimestamp(createTimestampStats("2017-01-01 00:00:00",
+            "2018-01-01 00:00:00"), pred, true));
 
     pred = createPredicateLeaf(PredicateLeaf.Operator.NULL_SAFE_EQUALS,
         PredicateLeaf.Type.FLOAT, "x", 15.0, null);
-    assertEquals(TruthValue.NO,
-        RecordReaderImpl.evaluatePredicateProto(createTimestampStats(10, 100), pred, null, null, OrcFile.WriterVersion.ORC_101, TypeDescription.Category.LONG));
-    assertEquals(TruthValue.YES_NO,
-        RecordReaderImpl.evaluatePredicateProto(createTimestampStats(10000, 100000), pred, null, null, OrcFile.WriterVersion.ORC_101, TypeDescription.Category.LONG));
+    assertEquals(TruthValue.YES_NO_NULL,
+        evaluateTimestamp(createTimestampStats("2017-01-01 00:00:00", "2018-01-01 00:00:00"),
+            pred, true));
+    assertEquals(TruthValue.YES_NO_NULL,
+        evaluateTimestamp(createTimestampStats("2017-01-01 00:00:00", "2018-01-01 00:00:00"),
+            pred, true));
+    // pre orc-135 should always be yes_no_null.
     pred = createPredicateLeaf(PredicateLeaf.Operator.NULL_SAFE_EQUALS,
-        PredicateLeaf.Type.STRING, "x", "15", null);
-    assertEquals(TruthValue.NO,
-        RecordReaderImpl.evaluatePredicateProto(createTimestampStats(10, 100), pred, null, null, OrcFile.WriterVersion.ORC_101, TypeDescription.Category.LONG));
+        PredicateLeaf.Type.TIMESTAMP, "x", Timestamp.valueOf("2017-01-01 00:00:00"), null);
+    assertEquals(TruthValue.YES_NO_NULL,
+        evaluateTimestamp(createTimestampStats("2017-01-01 00:00:00", "2017-01-01 00:00:00"),
+            pred, false));
 
     pred = createPredicateLeaf(PredicateLeaf.Operator.NULL_SAFE_EQUALS,
-        PredicateLeaf.Type.STRING, "x", new Timestamp(15).toString(), null);
+        PredicateLeaf.Type.STRING, "x", Timestamp.valueOf("2017-01-01 00:00:00").toString(), null);
     assertEquals(TruthValue.YES_NO,
-        RecordReaderImpl.evaluatePredicateProto(createTimestampStats(10, 100), pred, null, null, OrcFile.WriterVersion.ORC_101, TypeDescription.Category.LONG));
+        evaluateTimestamp(createTimestampStats("2017-01-01 00:00:00", "2018-01-01 00:00:00"),
+            pred, true));
 
     pred = createPredicateLeaf(PredicateLeaf.Operator.NULL_SAFE_EQUALS,
-        PredicateLeaf.Type.DATE, "x", new DateWritable(15).get(), null);
+        PredicateLeaf.Type.DATE, "x", Date.valueOf("2016-01-01"), null);
     assertEquals(TruthValue.NO,
-        RecordReaderImpl.evaluatePredicateProto(createTimestampStats(10, 100), pred, null, null, OrcFile.WriterVersion.ORC_101, TypeDescription.Category.LONG));
+        evaluateTimestamp(createTimestampStats("2017-01-01 00:00:00", "2017-01-01 00:00:00"),
+            pred, true));
     assertEquals(TruthValue.YES_NO,
-        RecordReaderImpl.evaluatePredicateProto(createTimestampStats(10 * 24L * 60L * 60L * 1000L,
-            100 * 24L * 60L * 60L * 1000L), pred, null, null, OrcFile.WriterVersion.ORC_101, TypeDescription.Category.LONG));
+        evaluateTimestamp(createTimestampStats("2015-01-01 00:00:00", "2016-01-01 00:00:00"),
+            pred, true));
 
     pred = createPredicateLeaf(PredicateLeaf.Operator.NULL_SAFE_EQUALS,
         PredicateLeaf.Type.DECIMAL, "x", new HiveDecimalWritable("15"), null);
-    assertEquals(TruthValue.NO,
-        RecordReaderImpl.evaluatePredicateProto(createTimestampStats(10, 100), pred, null, null, OrcFile.WriterVersion.ORC_101, TypeDescription.Category.LONG));
-    assertEquals(TruthValue.YES_NO,
-        RecordReaderImpl.evaluatePredicateProto(createTimestampStats(10000, 100000), pred, null, null, OrcFile.WriterVersion.ORC_101, TypeDescription.Category.LONG));
-
-    pred = createPredicateLeaf(PredicateLeaf.Operator.NULL_SAFE_EQUALS,
-        PredicateLeaf.Type.TIMESTAMP, "x", new Timestamp(15), null);
-    assertEquals(TruthValue.YES_NO,
-        RecordReaderImpl.evaluatePredicateProto(createTimestampStats(10, 100), pred, null, null, OrcFile.WriterVersion.ORC_101, TypeDescription.Category.LONG));
-    assertEquals(TruthValue.NO,
-        RecordReaderImpl.evaluatePredicateProto(createTimestampStats(10000, 100000), pred, null, null, OrcFile.WriterVersion.ORC_101, TypeDescription.Category.LONG));
+    assertEquals(TruthValue.YES_NO_NULL,
+        evaluateTimestamp(createTimestampStats("2015-01-01 00:00:00", "2016-01-01 00:00:00"),
+            pred, true));
   }
 
   @Test
@@ -684,17 +761,17 @@ public void testEquals() throws Exception {
         (PredicateLeaf.Operator.EQUALS, PredicateLeaf.Type.LONG, "x", 15L, null);
     assertEquals(TruthValue.NO_NULL,
-        RecordReaderImpl.evaluatePredicateProto(createIntStats(20L, 30L), pred, null, null, OrcFile.WriterVersion.ORC_101, TypeDescription.Category.LONG));
+        evaluateInteger(createIntStats(20L, 30L), pred));
     assertEquals(TruthValue.YES_NO_NULL,
-        RecordReaderImpl.evaluatePredicateProto(createIntStats(15L, 30L), pred, null, null, OrcFile.WriterVersion.ORC_101, TypeDescription.Category.LONG));
+        evaluateInteger(createIntStats(15L, 30L), pred));
    assertEquals(TruthValue.YES_NO_NULL,
-        RecordReaderImpl.evaluatePredicateProto(createIntStats(10L, 30L), pred, null, null, OrcFile.WriterVersion.ORC_101, TypeDescription.Category.LONG));
+        evaluateInteger(createIntStats(10L, 30L), pred));
     assertEquals(TruthValue.YES_NO_NULL,
-        RecordReaderImpl.evaluatePredicateProto(createIntStats(10L, 15L), pred, null, null, OrcFile.WriterVersion.ORC_101, TypeDescription.Category.LONG));
+        evaluateInteger(createIntStats(10L, 15L), pred));
     assertEquals(TruthValue.NO_NULL,
-        RecordReaderImpl.evaluatePredicateProto(createIntStats(0L, 10L), pred, null, null, OrcFile.WriterVersion.ORC_101, TypeDescription.Category.LONG));
+        evaluateInteger(createIntStats(0L, 10L), pred));
     assertEquals(TruthValue.YES_NULL,
-        RecordReaderImpl.evaluatePredicateProto(createIntStats(15L, 15L), pred, null, null, OrcFile.WriterVersion.ORC_101, TypeDescription.Category.LONG));
+        evaluateInteger(createIntStats(15L, 15L), pred));
   }
 
   @Test
@@ -703,17 +780,17 @@ public void testNullSafeEquals() throws Exception {
         (PredicateLeaf.Operator.NULL_SAFE_EQUALS, PredicateLeaf.Type.LONG, "x", 15L, null);
     assertEquals(TruthValue.NO,
-        RecordReaderImpl.evaluatePredicateProto(createIntStats(20L, 30L), pred, null, null, OrcFile.WriterVersion.ORC_101, TypeDescription.Category.LONG));
+        evaluateInteger(createIntStats(20L, 30L), pred));
     assertEquals(TruthValue.YES_NO,
-        RecordReaderImpl.evaluatePredicateProto(createIntStats(15L, 30L), pred, null, null, OrcFile.WriterVersion.ORC_101, TypeDescription.Category.LONG));
+        evaluateInteger(createIntStats(15L, 30L), pred));
     assertEquals(TruthValue.YES_NO,
-        RecordReaderImpl.evaluatePredicateProto(createIntStats(10L, 30L), pred, null, null, OrcFile.WriterVersion.ORC_101, TypeDescription.Category.LONG));
+        evaluateInteger(createIntStats(10L, 30L), pred));
     assertEquals(TruthValue.YES_NO,
-        RecordReaderImpl.evaluatePredicateProto(createIntStats(10L, 15L), pred, null, null, OrcFile.WriterVersion.ORC_101, TypeDescription.Category.LONG));
+        evaluateInteger(createIntStats(10L, 15L), pred));
     assertEquals(TruthValue.NO,
-        RecordReaderImpl.evaluatePredicateProto(createIntStats(0L, 10L), pred, null, null, OrcFile.WriterVersion.ORC_101, TypeDescription.Category.LONG));
+        evaluateInteger(createIntStats(0L, 10L), pred));
     assertEquals(TruthValue.YES_NO,
-        RecordReaderImpl.evaluatePredicateProto(createIntStats(15L, 15L), pred, null, null, OrcFile.WriterVersion.ORC_101, TypeDescription.Category.LONG));
+        evaluateInteger(createIntStats(15L, 15L), pred));
   }
 
   @Test
@@ -722,15 +799,15 @@ public void testLessThan() throws Exception {
         (PredicateLeaf.Operator.LESS_THAN, PredicateLeaf.Type.LONG, "x", 15L, null);
     assertEquals(TruthValue.NO_NULL,
-        RecordReaderImpl.evaluatePredicateProto(createIntStats(20L, 30L), lessThan, null, null, OrcFile.WriterVersion.ORC_101, TypeDescription.Category.LONG));
+        evaluateInteger(createIntStats(20L, 30L), lessThan));
     assertEquals(TruthValue.NO_NULL,
-        RecordReaderImpl.evaluatePredicateProto(createIntStats(15L, 30L), lessThan, null, null, OrcFile.WriterVersion.ORC_101, TypeDescription.Category.LONG));
+        evaluateInteger(createIntStats(15L, 30L), lessThan));
     assertEquals(TruthValue.YES_NO_NULL,
-        RecordReaderImpl.evaluatePredicateProto(createIntStats(10L, 30L), lessThan, null, null, OrcFile.WriterVersion.ORC_101, TypeDescription.Category.LONG));
+        evaluateInteger(createIntStats(10L, 30L), lessThan));
     assertEquals(TruthValue.YES_NO_NULL,
-        RecordReaderImpl.evaluatePredicateProto(createIntStats(10L, 15L), lessThan, null, null, OrcFile.WriterVersion.ORC_101, TypeDescription.Category.LONG));
+        evaluateInteger(createIntStats(10L, 15L), lessThan));
     assertEquals(TruthValue.YES_NULL,
-        RecordReaderImpl.evaluatePredicateProto(createIntStats(0L, 10L), lessThan, null, null, OrcFile.WriterVersion.ORC_101, TypeDescription.Category.LONG));
+        evaluateInteger(createIntStats(0L, 10L), lessThan));
   }
 
   @Test
@@ -739,15 +816,15 @@ public void testLessThanEquals() throws Exception {
         (PredicateLeaf.Operator.LESS_THAN_EQUALS, PredicateLeaf.Type.LONG, "x", 15L, null);
     assertEquals(TruthValue.NO_NULL,
-        RecordReaderImpl.evaluatePredicateProto(createIntStats(20L, 30L), pred, null, null, OrcFile.WriterVersion.ORC_101, TypeDescription.Category.LONG));
+        evaluateInteger(createIntStats(20L, 30L), pred));
    assertEquals(TruthValue.YES_NO_NULL,
-        RecordReaderImpl.evaluatePredicateProto(createIntStats(15L, 30L), pred, null, null, OrcFile.WriterVersion.ORC_101, TypeDescription.Category.LONG));
+        evaluateInteger(createIntStats(15L, 30L), pred));
     assertEquals(TruthValue.YES_NO_NULL,
-        RecordReaderImpl.evaluatePredicateProto(createIntStats(10L, 30L), pred, null, null, OrcFile.WriterVersion.ORC_101, TypeDescription.Category.LONG));
+        evaluateInteger(createIntStats(10L, 30L), pred));
     assertEquals(TruthValue.YES_NULL,
-        RecordReaderImpl.evaluatePredicateProto(createIntStats(10L, 15L), pred, null, null, OrcFile.WriterVersion.ORC_101, TypeDescription.Category.LONG));
+        evaluateInteger(createIntStats(10L, 15L), pred));
     assertEquals(TruthValue.YES_NULL,
-        RecordReaderImpl.evaluatePredicateProto(createIntStats(0L, 10L), pred, null, null, OrcFile.WriterVersion.ORC_101, TypeDescription.Category.LONG));
+        evaluateInteger(createIntStats(0L, 10L), pred));
   }
 
   @Test
@@ -759,13 +836,13 @@ public void testIn() throws Exception {
         (PredicateLeaf.Operator.IN, PredicateLeaf.Type.LONG, "x", null, args);
     assertEquals(TruthValue.YES_NULL,
-        RecordReaderImpl.evaluatePredicateProto(createIntStats(20L, 20L), pred, null, null, OrcFile.WriterVersion.ORC_101, TypeDescription.Category.LONG));
+        evaluateInteger(createIntStats(20L, 20L), pred));
     assertEquals(TruthValue.NO_NULL,
-        RecordReaderImpl.evaluatePredicateProto(createIntStats(30L, 30L), pred, null, null, OrcFile.WriterVersion.ORC_101, TypeDescription.Category.LONG));
+        evaluateInteger(createIntStats(30L, 30L), pred));
     assertEquals(TruthValue.YES_NO_NULL,
-        RecordReaderImpl.evaluatePredicateProto(createIntStats(10L, 30L), pred, null, null, OrcFile.WriterVersion.ORC_101, TypeDescription.Category.LONG));
+        evaluateInteger(createIntStats(10L, 30L), pred));
     assertEquals(TruthValue.NO_NULL,
-        RecordReaderImpl.evaluatePredicateProto(createIntStats(12L, 18L), pred, null, null, OrcFile.WriterVersion.ORC_101, TypeDescription.Category.LONG));
+        evaluateInteger(createIntStats(12L, 18L), pred));
   }
 
   @Test
@@ -777,19 +854,19 @@ public void testBetween() throws Exception {
         (PredicateLeaf.Operator.BETWEEN, PredicateLeaf.Type.LONG, "x", null, args);
     assertEquals(TruthValue.NO_NULL,
-        RecordReaderImpl.evaluatePredicateProto(createIntStats(0L, 5L), pred, null, null, OrcFile.WriterVersion.ORC_101, TypeDescription.Category.LONG));
+        evaluateInteger(createIntStats(0L, 5L), pred));
     assertEquals(TruthValue.NO_NULL,
-        RecordReaderImpl.evaluatePredicateProto(createIntStats(30L, 40L), pred, null, null, OrcFile.WriterVersion.ORC_101, TypeDescription.Category.LONG));
+        evaluateInteger(createIntStats(30L, 40L), pred));
     assertEquals(TruthValue.YES_NO_NULL,
-        RecordReaderImpl.evaluatePredicateProto(createIntStats(5L, 15L), pred, null, null, OrcFile.WriterVersion.ORC_101, TypeDescription.Category.LONG));
+        evaluateInteger(createIntStats(5L, 15L), pred));
     assertEquals(TruthValue.YES_NO_NULL,
-        RecordReaderImpl.evaluatePredicateProto(createIntStats(15L, 25L), pred, null, null, OrcFile.WriterVersion.ORC_101, TypeDescription.Category.LONG));
+        evaluateInteger(createIntStats(15L, 25L), pred));
     assertEquals(TruthValue.YES_NO_NULL,
-        RecordReaderImpl.evaluatePredicateProto(createIntStats(5L, 25L), pred, null, null, OrcFile.WriterVersion.ORC_101, TypeDescription.Category.LONG));
+        evaluateInteger(createIntStats(5L, 25L), pred));
     assertEquals(TruthValue.YES_NULL,
-        RecordReaderImpl.evaluatePredicateProto(createIntStats(10L, 20L), pred, null, null, OrcFile.WriterVersion.ORC_101, TypeDescription.Category.LONG));
+        evaluateInteger(createIntStats(10L, 20L), pred));
     assertEquals(TruthValue.YES_NULL,
-        RecordReaderImpl.evaluatePredicateProto(createIntStats(12L, 18L), pred, null, null, OrcFile.WriterVersion.ORC_101, TypeDescription.Category.LONG));
+        evaluateInteger(createIntStats(12L, 18L), pred));
   }
 
   @Test
@@ -798,7 +875,7 @@ public void testIsNull() throws Exception {
         (PredicateLeaf.Operator.IS_NULL, PredicateLeaf.Type.LONG, "x", null, null);
     assertEquals(TruthValue.YES_NO,
-        RecordReaderImpl.evaluatePredicateProto(createIntStats(20L, 30L), pred, null, null, OrcFile.WriterVersion.ORC_101, TypeDescription.Category.LONG));
+        evaluateInteger(createIntStats(20L, 30L), pred));
   }
 
 
@@ -808,17 +885,17 @@ public void testEqualsWithNullInStats() throws Exception {
         (PredicateLeaf.Operator.EQUALS, PredicateLeaf.Type.STRING, "x", "c", null);
     assertEquals(TruthValue.NO_NULL,
-        RecordReaderImpl.evaluatePredicateProto(createStringStats("d", "e", true), pred, null, null, OrcFile.WriterVersion.ORC_101, TypeDescription.Category.LONG)); // before
+        evaluateInteger(createStringStats("d", "e", true), pred)); // before
     assertEquals(TruthValue.NO_NULL,
-        RecordReaderImpl.evaluatePredicateProto(createStringStats("a", "b", true), pred, null, null, OrcFile.WriterVersion.ORC_101, TypeDescription.Category.LONG)); // after
+        evaluateInteger(createStringStats("a", "b", true), pred)); // after
     assertEquals(TruthValue.YES_NO_NULL,
-        RecordReaderImpl.evaluatePredicateProto(createStringStats("b", "c", true), pred, null, null, OrcFile.WriterVersion.ORC_101, TypeDescription.Category.LONG)); // max
+        evaluateInteger(createStringStats("b", "c", true), pred)); // max
     assertEquals(TruthValue.YES_NO_NULL,
-        RecordReaderImpl.evaluatePredicateProto(createStringStats("c", "d", true), pred, null, null, OrcFile.WriterVersion.ORC_101, TypeDescription.Category.LONG)); // min
+        evaluateInteger(createStringStats("c", "d", true), pred)); // min
     assertEquals(TruthValue.YES_NO_NULL,
-        RecordReaderImpl.evaluatePredicateProto(createStringStats("b", "d", true), pred, null, null, OrcFile.WriterVersion.ORC_101, TypeDescription.Category.LONG)); // middle
+        evaluateInteger(createStringStats("b", "d", true), pred)); // middle
     assertEquals(TruthValue.YES_NULL,
-        RecordReaderImpl.evaluatePredicateProto(createStringStats("c", "c", true), pred, null, null, OrcFile.WriterVersion.ORC_101, TypeDescription.Category.LONG)); // same
+        evaluateInteger(createStringStats("c", "c", true), pred)); // same
   }
 
   @Test
@@ -827,17 +904,17 @@ public void testNullSafeEqualsWithNullInStats() throws Exception {
         (PredicateLeaf.Operator.NULL_SAFE_EQUALS, PredicateLeaf.Type.STRING, "x", "c", null);
     assertEquals(TruthValue.NO,
-        RecordReaderImpl.evaluatePredicateProto(createStringStats("d", "e", true), pred, null, null, OrcFile.WriterVersion.ORC_101, TypeDescription.Category.LONG)); // before
+        evaluateInteger(createStringStats("d", "e", true), pred)); // before
     assertEquals(TruthValue.NO,
-        RecordReaderImpl.evaluatePredicateProto(createStringStats("a", "b", true), pred, null, null, OrcFile.WriterVersion.ORC_101, TypeDescription.Category.LONG)); // after
+        evaluateInteger(createStringStats("a", "b", true), pred)); // after
     assertEquals(TruthValue.YES_NO,
-        RecordReaderImpl.evaluatePredicateProto(createStringStats("b", "c", true), pred, null, null, OrcFile.WriterVersion.ORC_101, TypeDescription.Category.LONG)); // max
+        evaluateInteger(createStringStats("b", "c", true), pred)); // max
     assertEquals(TruthValue.YES_NO,
-        RecordReaderImpl.evaluatePredicateProto(createStringStats("c", "d", true), pred, null, null, OrcFile.WriterVersion.ORC_101, TypeDescription.Category.LONG)); // min
+        evaluateInteger(createStringStats("c", "d", true), pred)); // min
     assertEquals(TruthValue.YES_NO,
-        RecordReaderImpl.evaluatePredicateProto(createStringStats("b", "d", true), pred, null, null, OrcFile.WriterVersion.ORC_101, TypeDescription.Category.LONG)); // middle
+        evaluateInteger(createStringStats("b", "d", true), pred)); // middle
     assertEquals(TruthValue.YES_NO,
-        RecordReaderImpl.evaluatePredicateProto(createStringStats("c", "c", true), pred, null, null, OrcFile.WriterVersion.ORC_101, TypeDescription.Category.LONG)); // same
+        evaluateInteger(createStringStats("c", "c", true), pred)); // same
   }
 
   @Test
@@ -846,17 +923,17 @@ public void testLessThanWithNullInStats() throws Exception {
         (PredicateLeaf.Operator.LESS_THAN, PredicateLeaf.Type.STRING, "x", "c", null);
     assertEquals(TruthValue.NO_NULL,
-        RecordReaderImpl.evaluatePredicateProto(createStringStats("d", "e", true), pred, null, null, OrcFile.WriterVersion.ORC_101, TypeDescription.Category.LONG)); // before
+        evaluateInteger(createStringStats("d", "e", true), pred)); // before
     assertEquals(TruthValue.YES_NULL,
-        RecordReaderImpl.evaluatePredicateProto(createStringStats("a", "b", true), pred, null, null, OrcFile.WriterVersion.ORC_101, TypeDescription.Category.LONG)); // after
+        evaluateInteger(createStringStats("a", "b", true), pred)); // after
     assertEquals(TruthValue.YES_NO_NULL,
-        RecordReaderImpl.evaluatePredicateProto(createStringStats("b", "c", true), pred, null, null, OrcFile.WriterVersion.ORC_101, TypeDescription.Category.LONG)); // max
+        evaluateInteger(createStringStats("b", "c", true), pred)); // max
     assertEquals(TruthValue.NO_NULL,
-        RecordReaderImpl.evaluatePredicateProto(createStringStats("c", "d", true), pred, null, null, OrcFile.WriterVersion.ORC_101, TypeDescription.Category.LONG)); // min
+        evaluateInteger(createStringStats("c", "d", true), pred)); // min
     assertEquals(TruthValue.YES_NO_NULL,
-        RecordReaderImpl.evaluatePredicateProto(createStringStats("b", "d", true), pred, null, null, OrcFile.WriterVersion.ORC_101, TypeDescription.Category.LONG)); // middle
+        evaluateInteger(createStringStats("b", "d", true), pred)); // middle
     assertEquals(TruthValue.NO_NULL, // min, same stats
-        RecordReaderImpl.evaluatePredicateProto(createStringStats("c", "c", true), pred, null, null, OrcFile.WriterVersion.ORC_101, TypeDescription.Category.LONG));
+        evaluateInteger(createStringStats("c", "c", true), pred));
   }
 
   @Test
@@ -865,17 +942,17 @@ public void testLessThanEqualsWithNullInStats() throws Exception {
         (PredicateLeaf.Operator.LESS_THAN_EQUALS,
             PredicateLeaf.Type.STRING, "x", "c", null);
     assertEquals(TruthValue.NO_NULL,
-        RecordReaderImpl.evaluatePredicateProto(createStringStats("d", "e", true), pred, null, null, OrcFile.WriterVersion.ORC_101, TypeDescription.Category.LONG)); // before
+        evaluateInteger(createStringStats("d", "e", true), pred)); // before
     assertEquals(TruthValue.YES_NULL,
-        RecordReaderImpl.evaluatePredicateProto(createStringStats("a", "b", true), pred, null, null, OrcFile.WriterVersion.ORC_101, TypeDescription.Category.LONG)); // after
+        evaluateInteger(createStringStats("a", "b", true), pred)); // after
     assertEquals(TruthValue.YES_NULL,
-        RecordReaderImpl.evaluatePredicateProto(createStringStats("b", "c", true), pred, null, null, OrcFile.WriterVersion.ORC_101, TypeDescription.Category.LONG)); // max
+        evaluateInteger(createStringStats("b", "c", true), pred)); // max
     assertEquals(TruthValue.YES_NO_NULL,
-        RecordReaderImpl.evaluatePredicateProto(createStringStats("c", "d", true), pred, null, null, OrcFile.WriterVersion.ORC_101, TypeDescription.Category.LONG)); // min
+        evaluateInteger(createStringStats("c", "d", true), pred)); // min
     assertEquals(TruthValue.YES_NO_NULL,
-        RecordReaderImpl.evaluatePredicateProto(createStringStats("b", "d", true), pred, null, null, OrcFile.WriterVersion.ORC_101, TypeDescription.Category.LONG)); // middle
+        evaluateInteger(createStringStats("b", "d", true), pred)); // middle
     assertEquals(TruthValue.YES_NO_NULL,
-        RecordReaderImpl.evaluatePredicateProto(createStringStats("c", "c", true), pred, null, null, OrcFile.WriterVersion.ORC_101, TypeDescription.Category.LONG)); // same
+        evaluateInteger(createStringStats("c", "c", true), pred)); // same
   }
 
   @Test
@@ -887,17 +964,17 @@ public void testInWithNullInStats() throws Exception {
         (PredicateLeaf.Operator.IN, PredicateLeaf.Type.STRING, "x", null, args);
     assertEquals(TruthValue.NO_NULL, // before & after
-        RecordReaderImpl.evaluatePredicateProto(createStringStats("d", "e", true), pred, null, null, OrcFile.WriterVersion.ORC_101, TypeDescription.Category.LONG));
+        evaluateInteger(createStringStats("d", "e", true), pred));
     assertEquals(TruthValue.NO_NULL,
-        RecordReaderImpl.evaluatePredicateProto(createStringStats("a", "b", true), pred, null, null, OrcFile.WriterVersion.ORC_101, TypeDescription.Category.LONG)); // after
+        evaluateInteger(createStringStats("a", "b", true), pred)); // after
     assertEquals(TruthValue.YES_NO_NULL,
-        RecordReaderImpl.evaluatePredicateProto(createStringStats("e", "f", true), pred, null, null, OrcFile.WriterVersion.ORC_101, TypeDescription.Category.LONG)); // max
+        evaluateInteger(createStringStats("e", "f", true), pred)); // max
     assertEquals(TruthValue.YES_NO_NULL,
-        RecordReaderImpl.evaluatePredicateProto(createStringStats("c", "d", true), pred, null, null, OrcFile.WriterVersion.ORC_101, TypeDescription.Category.LONG)); // min
+        evaluateInteger(createStringStats("c", "d", true), pred)); // min
     assertEquals(TruthValue.YES_NO_NULL,
-        RecordReaderImpl.evaluatePredicateProto(createStringStats("b", "d", true), pred, null, null, OrcFile.WriterVersion.ORC_101, TypeDescription.Category.LONG)); // middle
+        evaluateInteger(createStringStats("b", "d", true), pred)); // middle
     assertEquals(TruthValue.YES_NULL,
-        RecordReaderImpl.evaluatePredicateProto(createStringStats("c", "c", true), pred, null, null, OrcFile.WriterVersion.ORC_101, TypeDescription.Category.LONG)); // same
+        evaluateInteger(createStringStats("c", "c", true), pred)); // same
   }
 
   @Test
@@ -909,31 +986,57 @@ public void testBetweenWithNullInStats() throws Exception {
         (PredicateLeaf.Operator.BETWEEN, PredicateLeaf.Type.STRING, "x", null, args);
     assertEquals(TruthValue.YES_NULL, // before & after
-        RecordReaderImpl.evaluatePredicateProto(createStringStats("d", "e", true), pred, null, null, OrcFile.WriterVersion.ORC_101, TypeDescription.Category.LONG));
+        evaluateInteger(createStringStats("d", "e", true), pred));
     assertEquals(TruthValue.YES_NULL, // before & max
-        RecordReaderImpl.evaluatePredicateProto(createStringStats("e", "f", true), pred, null, null, OrcFile.WriterVersion.ORC_101, TypeDescription.Category.LONG));
+        evaluateInteger(createStringStats("e", "f", true), pred));
     assertEquals(TruthValue.NO_NULL, // before & before
-        RecordReaderImpl.evaluatePredicateProto(createStringStats("h", "g", true), pred, null, null, OrcFile.WriterVersion.ORC_101, TypeDescription.Category.LONG));
+        evaluateInteger(createStringStats("h", "g", true), pred));
     assertEquals(TruthValue.YES_NO_NULL, // before & min
-        RecordReaderImpl.evaluatePredicateProto(createStringStats("f", "g", true), pred, null, null, OrcFile.WriterVersion.ORC_101, TypeDescription.Category.LONG));
+        evaluateInteger(createStringStats("f", "g", true), pred));
     assertEquals(TruthValue.YES_NO_NULL, // before & middle
-        RecordReaderImpl.evaluatePredicateProto(createStringStats("e", "g", true), pred, null, null, OrcFile.WriterVersion.ORC_101, TypeDescription.Category.LONG));
+        evaluateInteger(createStringStats("e", "g", true), pred));
     assertEquals(TruthValue.YES_NULL, // min & after
-        RecordReaderImpl.evaluatePredicateProto(createStringStats("c", "e", true), pred, null, null, OrcFile.WriterVersion.ORC_101, TypeDescription.Category.LONG));
+        evaluateInteger(createStringStats("c", "e", true), pred));
     assertEquals(TruthValue.YES_NULL, // min & max
-        RecordReaderImpl.evaluatePredicateProto(createStringStats("c", "f", true), pred, null, null, OrcFile.WriterVersion.ORC_101, TypeDescription.Category.LONG));
+        evaluateInteger(createStringStats("c", "f", true), pred));
     assertEquals(TruthValue.YES_NO_NULL, // min & middle
-        RecordReaderImpl.evaluatePredicateProto(createStringStats("c", "g", true), pred, null, null, OrcFile.WriterVersion.ORC_101, TypeDescription.Category.LONG));
+        evaluateInteger(createStringStats("c", "g", true), pred));
     assertEquals(TruthValue.NO_NULL,
-        RecordReaderImpl.evaluatePredicateProto(createStringStats("a", "b", true), pred, null, null, OrcFile.WriterVersion.ORC_101, TypeDescription.Category.LONG)); // after
+        evaluateInteger(createStringStats("a", "b", true), pred)); // after
     assertEquals(TruthValue.YES_NO_NULL,
-        RecordReaderImpl.evaluatePredicateProto(createStringStats("a", "c", true), pred, null, null, OrcFile.WriterVersion.ORC_101, TypeDescription.Category.LONG)); // max
+        evaluateInteger(createStringStats("a", "c", true), pred)); // max
     assertEquals(TruthValue.YES_NO_NULL,
-        RecordReaderImpl.evaluatePredicateProto(createStringStats("b", "d", true), pred, null, null, OrcFile.WriterVersion.ORC_101, TypeDescription.Category.LONG)); // middle
+        evaluateInteger(createStringStats("b", "d", true), pred)); // middle
     assertEquals(TruthValue.YES_NULL, // min & after, same stats
-        RecordReaderImpl.evaluatePredicateProto(createStringStats("c", "c", true), pred, null, null, OrcFile.WriterVersion.ORC_101, TypeDescription.Category.LONG));
+        evaluateInteger(createStringStats("c", "c", true), pred));
+  }
+
+  @Test
+  public void testTimestampStatsOldFiles() throws Exception {
+    PredicateLeaf pred = createPredicateLeaf
+        (PredicateLeaf.Operator.EQUALS, PredicateLeaf.Type.TIMESTAMP,
+            "x", Timestamp.valueOf("2000-01-01 00:00:00"), null);
+    OrcProto.ColumnStatistics cs = createTimestampStats("2000-01-01 00:00:00", "2001-01-01 00:00:00");
+    assertEquals(TruthValue.YES_NO_NULL,
+        evaluateTimestampBloomfilter(cs, pred, new BloomFilterUtf8(10000, 0.01), OrcFile.WriterVersion.ORC_101));
+    BloomFilterUtf8 bf = new BloomFilterUtf8(10, 0.05);
+    bf.addLong(getUtcTimestamp("2000-06-01 00:00:00"));
+    assertEquals(TruthValue.NO_NULL,
+        evaluateTimestampBloomfilter(cs, pred, bf, OrcFile.WriterVersion.ORC_135));
+    assertEquals(TruthValue.YES_NO_NULL,
+        evaluateTimestampBloomfilter(cs, pred, bf, OrcFile.WriterVersion.ORC_101));
+  }
+
+  private static long getUtcTimestamp(String ts) {
+    DateFormat dateFormat = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
+    dateFormat.setTimeZone(utcTz);
+    try {
+      return dateFormat.parse(ts).getTime();
+    } catch (ParseException e) {
+      throw new IllegalArgumentException("Can't parse " + ts, e);
+    }
   }
 
   @Test
@@ -942,9 +1045,9 @@ public void testIsNullWithNullInStats() throws Exception {
         (PredicateLeaf.Operator.IS_NULL, PredicateLeaf.Type.STRING, "x", null, null);
     assertEquals(TruthValue.YES_NO,
-        RecordReaderImpl.evaluatePredicateProto(createStringStats("c", "d", true), pred, null, null, OrcFile.WriterVersion.ORC_101, TypeDescription.Category.LONG));
+        evaluateInteger(createStringStats("c", "d", true), pred));
     assertEquals(TruthValue.NO,
-        RecordReaderImpl.evaluatePredicateProto(createStringStats("c", "d", false), pred, null, null, OrcFile.WriterVersion.ORC_101, TypeDescription.Category.LONG));
+        evaluateInteger(createStringStats("c", "d", false), pred));
   }
 
   @Test
@@ -1527,77 +1630,6 @@ public void testDateWritableInBloomFilter() throws Exception {
     assertEquals(TruthValue.YES_NO_NULL, RecordReaderImpl.evaluatePredicate(cs, pred, bf));
   }
 
-  @Test
-  public void testTimestampNullSafeEqualsBloomFilter() throws Exception {
-    PredicateLeaf pred = createPredicateLeaf(
-        PredicateLeaf.Operator.NULL_SAFE_EQUALS, PredicateLeaf.Type.TIMESTAMP, "x",
-        new Timestamp(15),
-        null);
-    BloomFilter bf = new BloomFilter(10000);
-    for (int i = 20; i < 1000; i++) {
-      bf.addLong((new Timestamp(i)).getTime());
-    }
-    ColumnStatistics cs = ColumnStatisticsImpl.deserialize(createTimestampStats(10, 100));
-    assertEquals(TruthValue.NO, RecordReaderImpl.evaluatePredicate(cs, pred, bf));
-
-    bf.addLong((new Timestamp(15)).getTime());
-    assertEquals(TruthValue.YES_NO, RecordReaderImpl.evaluatePredicate(cs, pred, bf));
-  }
-
-  @Test
-  public void testTimestampEqualsBloomFilter() throws Exception {
-    PredicateLeaf pred = createPredicateLeaf(
-        PredicateLeaf.Operator.EQUALS, PredicateLeaf.Type.TIMESTAMP, "x", new Timestamp(15), null);
-    BloomFilter bf = new BloomFilter(10000);
-    for (int i = 20; i < 1000; i++) {
-      bf.addLong((new Timestamp(i)).getTime());
-    }
-    ColumnStatistics cs = ColumnStatisticsImpl.deserialize(createTimestampStats(10, 100));
-    assertEquals(TruthValue.NO_NULL, RecordReaderImpl.evaluatePredicate(cs, pred, bf));
-
-    bf.addLong((new Timestamp(15)).getTime());
-    assertEquals(TruthValue.YES_NO_NULL, RecordReaderImpl.evaluatePredicate(cs, pred, bf));
-  }
-
-  @Test
-  public void testTimestampInBloomFilter() throws Exception {
-    List<Object> args = new ArrayList<Object>();
-    args.add(new Timestamp(15));
-    args.add(new Timestamp(19));
-    PredicateLeaf pred = createPredicateLeaf
-        (PredicateLeaf.Operator.IN, PredicateLeaf.Type.TIMESTAMP,
-            "x", null, args);
-    BloomFilter bf = new BloomFilter(10000);
-    for (int i = 20; i < 1000; i++) {
-      bf.addLong((new Timestamp(i)).getTime());
-    }
-    ColumnStatistics cs = ColumnStatisticsImpl.deserialize(createTimestampStats(10, 100));
-    assertEquals(TruthValue.NO_NULL, RecordReaderImpl.evaluatePredicate(cs, pred, bf));
-
-    bf.addLong((new Timestamp(19)).getTime());
-    assertEquals(TruthValue.YES_NO_NULL, RecordReaderImpl.evaluatePredicate(cs, pred, bf));
-
-    bf.addLong((new Timestamp(15)).getTime());
-    assertEquals(TruthValue.YES_NO_NULL, RecordReaderImpl.evaluatePredicate(cs, pred, bf));
-  }
-
-  @Test
-  public void testDecimalNullSafeEqualsBloomFilter() throws Exception {
-    PredicateLeaf pred = createPredicateLeaf(
-        PredicateLeaf.Operator.NULL_SAFE_EQUALS, PredicateLeaf.Type.DECIMAL, "x",
-        new HiveDecimalWritable("15"),
-        null);
-    BloomFilter bf = new BloomFilter(10000);
-    for (int i = 20; i < 1000; i++) {
-      bf.addString(HiveDecimal.create(i).toString());
-    }
-    ColumnStatistics cs = ColumnStatisticsImpl.deserialize(createDecimalStats("10", "200"));
-    assertEquals(TruthValue.NO, RecordReaderImpl.evaluatePredicate(cs, pred, bf));
-
-    bf.addString(HiveDecimal.create(15).toString());
-    assertEquals(TruthValue.YES_NO, RecordReaderImpl.evaluatePredicate(cs, pred, bf));
-  }
-
   @Test
   public void testDecimalEqualsBloomFilter() throws Exception {
     PredicateLeaf pred = createPredicateLeaf(
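
Before the TestBloomFilter change below, note what the new evaluateTimestampBloomfilter helper is really probing: a reader may only trust a timestamp bloom filter when the file's writer version and column encoding say its entries are UTC-normalized; older filters hashed writer-local values and can only yield YES_NO_NULL. A sketch of that gating decision, assuming the proto field added by this patch exposes the usual hasBloomEncoding()/getBloomEncoding() accessors (an illustration, not the reader's actual code path):

    import org.apache.orc.OrcFile;
    import org.apache.orc.OrcProto;
    import org.apache.orc.util.BloomFilterIO;

    public class TimestampBloomGate {
      // A pre-ORC-135 writer hashed timestamps in its own time zone, so its
      // bloom filter cannot safely rule rows out for a reader elsewhere.
      static boolean canTrustTimestampBloom(OrcFile.WriterVersion version,
                                            OrcProto.ColumnEncoding encoding) {
        return version.includes(OrcFile.WriterVersion.ORC_135)
            && encoding.hasBloomEncoding()
            && encoding.getBloomEncoding() == BloomFilterIO.Encoding.UTF8_UTC.getId();
      }
    }

testTimestampStatsOldFiles above pins down both sides of that split: the same filter yields NO_NULL under ORC_135 but degrades to YES_NO_NULL under ORC_101.
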
bf : bloomFilterIndex[col].getBloomFilterList()) { BloomFilter toMerge = BloomFilterIO.deserialize( - index.getBloomFilterKinds()[col], version, type, bf); + index.getBloomFilterKinds()[col], encoding, version, type, bf); buf.append("\n Entry ").append(idx++).append(":").append(getBloomFilterStats(toMerge)); if (stripeLevelBF == null) { stripeLevelBF = toMerge; diff --git a/java/tools/src/java/org/apache/orc/tools/JsonFileDump.java b/java/tools/src/java/org/apache/orc/tools/JsonFileDump.java index 0de07ad282..4ea9463cd1 100644 --- a/java/tools/src/java/org/apache/orc/tools/JsonFileDump.java +++ b/java/tools/src/java/org/apache/orc/tools/JsonFileDump.java @@ -193,7 +193,8 @@ public static void printJsonMetaData(List files, writeRowGroupIndexes(writer, col, indices.getRowGroupIndex()); writeBloomFilterIndexes(writer, col, indices, reader.getWriterVersion(), - reader.getSchema().findSubtype(col).getCategory()); + reader.getSchema().findSubtype(col).getCategory(), + footer.getColumns(col)); writer.endObject(); } writer.endArray(); @@ -344,7 +345,8 @@ private static void writeColumnStatistics(JSONWriter writer, ColumnStatistics cs private static void writeBloomFilterIndexes(JSONWriter writer, int col, OrcIndex index, OrcFile.WriterVersion version, - TypeDescription.Category type + TypeDescription.Category type, + OrcProto.ColumnEncoding encoding ) throws JSONException { BloomFilter stripeLevelBF = null; @@ -356,7 +358,7 @@ private static void writeBloomFilterIndexes(JSONWriter writer, int col, writer.object(); writer.key("entryId").value(entryIx++); BloomFilter toMerge = BloomFilterIO.deserialize( - index.getBloomFilterKinds()[col], version, type, bf); + index.getBloomFilterKinds()[col], encoding, version, type, bf); writeBloomFilterStats(writer, toMerge); if (stripeLevelBF == null) { stripeLevelBF = toMerge; diff --git a/java/tools/src/test/resources/orc-file-dump-bloomfilter.out b/java/tools/src/test/resources/orc-file-dump-bloomfilter.out index e23327acba..dcf29f7946 100644 --- a/java/tools/src/test/resources/orc-file-dump-bloomfilter.out +++ b/java/tools/src/test/resources/orc-file-dump-bloomfilter.out @@ -1,5 +1,5 @@ Structure for TestFileDump.testDump.orc -File Version: 0.12 with ORC_101 +File Version: 0.12 with ORC_135 Rows: 21000 Compression: ZLIB Compression size: 4096 @@ -39,7 +39,7 @@ File Statistics: Column 3: count: 21000 hasNull: false min: Darkness, max: worst sum: 81761 Stripes: - Stripe: offset: 3 data: 63786 rows: 5000 tail: 86 index: 749 + Stripe: offset: 3 data: 63786 rows: 5000 tail: 87 index: 749 Stream: column 0 section ROW_INDEX start: 3 length 17 Stream: column 1 section ROW_INDEX start: 20 length 166 Stream: column 2 section ROW_INDEX start: 186 length 169 @@ -67,17 +67,17 @@ Stripes: Entry 3: numHashFunctions: 4 bitCount: 6272 popCount: 138 loadFactor: 0.022 expectedFpp: 2.343647E-7 Entry 4: numHashFunctions: 4 bitCount: 6272 popCount: 138 loadFactor: 0.022 expectedFpp: 2.343647E-7 Stripe level merge: numHashFunctions: 4 bitCount: 6272 popCount: 138 loadFactor: 0.022 expectedFpp: 2.343647E-7 - Stripe: offset: 64624 data: 63775 rows: 5000 tail: 86 index: 742 - Stream: column 0 section ROW_INDEX start: 64624 length 17 - Stream: column 1 section ROW_INDEX start: 64641 length 164 - Stream: column 2 section ROW_INDEX start: 64805 length 168 - Stream: column 3 section ROW_INDEX start: 64973 length 83 - Stream: column 3 section BLOOM_FILTER_UTF8 start: 65056 length 310 - Stream: column 1 section DATA start: 65366 length 20035 - Stream: column 2 section DATA start: 85401 
length 40050 - Stream: column 3 section DATA start: 125451 length 3532 - Stream: column 3 section LENGTH start: 128983 length 25 - Stream: column 3 section DICTIONARY_DATA start: 129008 length 133 + Stripe: offset: 64625 data: 63775 rows: 5000 tail: 87 index: 742 + Stream: column 0 section ROW_INDEX start: 64625 length 17 + Stream: column 1 section ROW_INDEX start: 64642 length 164 + Stream: column 2 section ROW_INDEX start: 64806 length 168 + Stream: column 3 section ROW_INDEX start: 64974 length 83 + Stream: column 3 section BLOOM_FILTER_UTF8 start: 65057 length 310 + Stream: column 1 section DATA start: 65367 length 20035 + Stream: column 2 section DATA start: 85402 length 40050 + Stream: column 3 section DATA start: 125452 length 3532 + Stream: column 3 section LENGTH start: 128984 length 25 + Stream: column 3 section DICTIONARY_DATA start: 129009 length 133 Encoding column 0: DIRECT Encoding column 1: DIRECT_V2 Encoding column 2: DIRECT_V2 @@ -95,17 +95,17 @@ Stripes: Entry 3: numHashFunctions: 4 bitCount: 6272 popCount: 138 loadFactor: 0.022 expectedFpp: 2.343647E-7 Entry 4: numHashFunctions: 4 bitCount: 6272 popCount: 138 loadFactor: 0.022 expectedFpp: 2.343647E-7 Stripe level merge: numHashFunctions: 4 bitCount: 6272 popCount: 138 loadFactor: 0.022 expectedFpp: 2.343647E-7 - Stripe: offset: 129227 data: 63787 rows: 5000 tail: 86 index: 748 - Stream: column 0 section ROW_INDEX start: 129227 length 17 - Stream: column 1 section ROW_INDEX start: 129244 length 163 - Stream: column 2 section ROW_INDEX start: 129407 length 168 - Stream: column 3 section ROW_INDEX start: 129575 length 90 - Stream: column 3 section BLOOM_FILTER_UTF8 start: 129665 length 310 - Stream: column 1 section DATA start: 129975 length 20035 - Stream: column 2 section DATA start: 150010 length 40050 - Stream: column 3 section DATA start: 190060 length 3544 - Stream: column 3 section LENGTH start: 193604 length 25 - Stream: column 3 section DICTIONARY_DATA start: 193629 length 133 + Stripe: offset: 129229 data: 63787 rows: 5000 tail: 87 index: 748 + Stream: column 0 section ROW_INDEX start: 129229 length 17 + Stream: column 1 section ROW_INDEX start: 129246 length 163 + Stream: column 2 section ROW_INDEX start: 129409 length 168 + Stream: column 3 section ROW_INDEX start: 129577 length 90 + Stream: column 3 section BLOOM_FILTER_UTF8 start: 129667 length 310 + Stream: column 1 section DATA start: 129977 length 20035 + Stream: column 2 section DATA start: 150012 length 40050 + Stream: column 3 section DATA start: 190062 length 3544 + Stream: column 3 section LENGTH start: 193606 length 25 + Stream: column 3 section DICTIONARY_DATA start: 193631 length 133 Encoding column 0: DIRECT Encoding column 1: DIRECT_V2 Encoding column 2: DIRECT_V2 @@ -123,17 +123,17 @@ Stripes: Entry 3: numHashFunctions: 4 bitCount: 6272 popCount: 138 loadFactor: 0.022 expectedFpp: 2.343647E-7 Entry 4: numHashFunctions: 4 bitCount: 6272 popCount: 138 loadFactor: 0.022 expectedFpp: 2.343647E-7 Stripe level merge: numHashFunctions: 4 bitCount: 6272 popCount: 138 loadFactor: 0.022 expectedFpp: 2.343647E-7 - Stripe: offset: 193848 data: 63817 rows: 5000 tail: 85 index: 750 - Stream: column 0 section ROW_INDEX start: 193848 length 17 - Stream: column 1 section ROW_INDEX start: 193865 length 165 - Stream: column 2 section ROW_INDEX start: 194030 length 167 - Stream: column 3 section ROW_INDEX start: 194197 length 91 - Stream: column 3 section BLOOM_FILTER_UTF8 start: 194288 length 310 - Stream: column 1 section DATA start: 194598 length 20035 - 
Stream: column 2 section DATA start: 214633 length 40050 - Stream: column 3 section DATA start: 254683 length 3574 - Stream: column 3 section LENGTH start: 258257 length 25 - Stream: column 3 section DICTIONARY_DATA start: 258282 length 133 + Stripe: offset: 193851 data: 63817 rows: 5000 tail: 86 index: 750 + Stream: column 0 section ROW_INDEX start: 193851 length 17 + Stream: column 1 section ROW_INDEX start: 193868 length 165 + Stream: column 2 section ROW_INDEX start: 194033 length 167 + Stream: column 3 section ROW_INDEX start: 194200 length 91 + Stream: column 3 section BLOOM_FILTER_UTF8 start: 194291 length 310 + Stream: column 1 section DATA start: 194601 length 20035 + Stream: column 2 section DATA start: 214636 length 40050 + Stream: column 3 section DATA start: 254686 length 3574 + Stream: column 3 section LENGTH start: 258260 length 25 + Stream: column 3 section DICTIONARY_DATA start: 258285 length 133 Encoding column 0: DIRECT Encoding column 1: DIRECT_V2 Encoding column 2: DIRECT_V2 @@ -151,17 +151,17 @@ Stripes: Entry 3: numHashFunctions: 4 bitCount: 6272 popCount: 138 loadFactor: 0.022 expectedFpp: 2.343647E-7 Entry 4: numHashFunctions: 4 bitCount: 6272 popCount: 138 loadFactor: 0.022 expectedFpp: 2.343647E-7 Stripe level merge: numHashFunctions: 4 bitCount: 6272 popCount: 138 loadFactor: 0.022 expectedFpp: 2.343647E-7 - Stripe: offset: 258500 data: 12943 rows: 1000 tail: 78 index: 375 - Stream: column 0 section ROW_INDEX start: 258500 length 12 - Stream: column 1 section ROW_INDEX start: 258512 length 38 - Stream: column 2 section ROW_INDEX start: 258550 length 41 - Stream: column 3 section ROW_INDEX start: 258591 length 40 - Stream: column 3 section BLOOM_FILTER_UTF8 start: 258631 length 244 - Stream: column 1 section DATA start: 258875 length 4007 - Stream: column 2 section DATA start: 262882 length 8010 - Stream: column 3 section DATA start: 270892 length 768 - Stream: column 3 section LENGTH start: 271660 length 25 - Stream: column 3 section DICTIONARY_DATA start: 271685 length 133 + Stripe: offset: 258504 data: 12943 rows: 1000 tail: 80 index: 375 + Stream: column 0 section ROW_INDEX start: 258504 length 12 + Stream: column 1 section ROW_INDEX start: 258516 length 38 + Stream: column 2 section ROW_INDEX start: 258554 length 41 + Stream: column 3 section ROW_INDEX start: 258595 length 40 + Stream: column 3 section BLOOM_FILTER_UTF8 start: 258635 length 244 + Stream: column 1 section DATA start: 258879 length 4007 + Stream: column 2 section DATA start: 262886 length 8010 + Stream: column 3 section DATA start: 270896 length 768 + Stream: column 3 section LENGTH start: 271664 length 25 + Stream: column 3 section DICTIONARY_DATA start: 271689 length 133 Encoding column 0: DIRECT Encoding column 1: DIRECT_V2 Encoding column 2: DIRECT_V2 @@ -172,7 +172,7 @@ Stripes: Entry 0: numHashFunctions: 4 bitCount: 6272 popCount: 138 loadFactor: 0.022 expectedFpp: 2.343647E-7 Stripe level merge: numHashFunctions: 4 bitCount: 6272 popCount: 138 loadFactor: 0.022 expectedFpp: 2.343647E-7 -File length: 272444 bytes +File length: 272450 bytes Padding length: 0 bytes Padding ratio: 0% ________________________________________________________________________________________________________________________ diff --git a/java/tools/src/test/resources/orc-file-dump-bloomfilter2.out b/java/tools/src/test/resources/orc-file-dump-bloomfilter2.out index 8296382e90..4ea04b57d4 100644 --- a/java/tools/src/test/resources/orc-file-dump-bloomfilter2.out +++ 
b/java/tools/src/test/resources/orc-file-dump-bloomfilter2.out @@ -1,5 +1,5 @@ Structure for TestFileDump.testDump.orc -File Version: 0.12 with ORC_101 +File Version: 0.12 with ORC_135 Rows: 21000 Compression: ZLIB Compression size: 4096 @@ -39,7 +39,7 @@ File Statistics: Column 3: count: 21000 hasNull: false min: Darkness, max: worst sum: 81761 Stripes: - Stripe: offset: 3 data: 63786 rows: 5000 tail: 104 index: 14949 + Stripe: offset: 3 data: 63786 rows: 5000 tail: 108 index: 14949 Stream: column 0 section ROW_INDEX start: 3 length 17 Stream: column 1 section ROW_INDEX start: 20 length 166 Stream: column 2 section ROW_INDEX start: 186 length 169 @@ -70,20 +70,20 @@ Stripes: Entry 3: numHashFunctions: 7 bitCount: 9600 popCount: 4971 loadFactor: 0.5178 expectedFpp: 0.009981772 Entry 4: numHashFunctions: 7 bitCount: 9600 popCount: 4949 loadFactor: 0.5155 expectedFpp: 0.009676614 Stripe level merge: numHashFunctions: 7 bitCount: 9600 popCount: 9347 loadFactor: 0.9736 expectedFpp: 0.829482 - Stripe: offset: 78842 data: 63775 rows: 5000 tail: 103 index: 14940 - Stream: column 0 section ROW_INDEX start: 78842 length 17 - Stream: column 1 section ROW_INDEX start: 78859 length 164 - Stream: column 2 section ROW_INDEX start: 79023 length 168 - Stream: column 2 section BLOOM_FILTER start: 79191 length 6533 - Stream: column 2 section BLOOM_FILTER_UTF8 start: 85724 length 6046 - Stream: column 3 section ROW_INDEX start: 91770 length 83 - Stream: column 3 section BLOOM_FILTER start: 91853 length 1038 - Stream: column 3 section BLOOM_FILTER_UTF8 start: 92891 length 891 - Stream: column 1 section DATA start: 93782 length 20035 - Stream: column 2 section DATA start: 113817 length 40050 - Stream: column 3 section DATA start: 153867 length 3532 - Stream: column 3 section LENGTH start: 157399 length 25 - Stream: column 3 section DICTIONARY_DATA start: 157424 length 133 + Stripe: offset: 78846 data: 63775 rows: 5000 tail: 107 index: 14940 + Stream: column 0 section ROW_INDEX start: 78846 length 17 + Stream: column 1 section ROW_INDEX start: 78863 length 164 + Stream: column 2 section ROW_INDEX start: 79027 length 168 + Stream: column 2 section BLOOM_FILTER start: 79195 length 6533 + Stream: column 2 section BLOOM_FILTER_UTF8 start: 85728 length 6046 + Stream: column 3 section ROW_INDEX start: 91774 length 83 + Stream: column 3 section BLOOM_FILTER start: 91857 length 1038 + Stream: column 3 section BLOOM_FILTER_UTF8 start: 92895 length 891 + Stream: column 1 section DATA start: 93786 length 20035 + Stream: column 2 section DATA start: 113821 length 40050 + Stream: column 3 section DATA start: 153871 length 3532 + Stream: column 3 section LENGTH start: 157403 length 25 + Stream: column 3 section DICTIONARY_DATA start: 157428 length 133 Encoding column 0: DIRECT Encoding column 1: DIRECT_V2 Encoding column 2: DIRECT_V2 @@ -101,20 +101,20 @@ Stripes: Entry 3: numHashFunctions: 7 bitCount: 9600 popCount: 4962 loadFactor: 0.5169 expectedFpp: 0.009855959 Entry 4: numHashFunctions: 7 bitCount: 9600 popCount: 4966 loadFactor: 0.5173 expectedFpp: 0.009911705 Stripe level merge: numHashFunctions: 7 bitCount: 9600 popCount: 9344 loadFactor: 0.9733 expectedFpp: 0.8276205 - Stripe: offset: 157660 data: 63787 rows: 5000 tail: 104 index: 14946 - Stream: column 0 section ROW_INDEX start: 157660 length 17 - Stream: column 1 section ROW_INDEX start: 157677 length 163 - Stream: column 2 section ROW_INDEX start: 157840 length 168 - Stream: column 2 section BLOOM_FILTER start: 158008 length 6533 - Stream: column 2 section 
BLOOM_FILTER_UTF8 start: 164541 length 6046 - Stream: column 3 section ROW_INDEX start: 170587 length 90 - Stream: column 3 section BLOOM_FILTER start: 170677 length 1038 - Stream: column 3 section BLOOM_FILTER_UTF8 start: 171715 length 891 - Stream: column 1 section DATA start: 172606 length 20035 - Stream: column 2 section DATA start: 192641 length 40050 - Stream: column 3 section DATA start: 232691 length 3544 - Stream: column 3 section LENGTH start: 236235 length 25 - Stream: column 3 section DICTIONARY_DATA start: 236260 length 133 + Stripe: offset: 157668 data: 63787 rows: 5000 tail: 108 index: 14946 + Stream: column 0 section ROW_INDEX start: 157668 length 17 + Stream: column 1 section ROW_INDEX start: 157685 length 163 + Stream: column 2 section ROW_INDEX start: 157848 length 168 + Stream: column 2 section BLOOM_FILTER start: 158016 length 6533 + Stream: column 2 section BLOOM_FILTER_UTF8 start: 164549 length 6046 + Stream: column 3 section ROW_INDEX start: 170595 length 90 + Stream: column 3 section BLOOM_FILTER start: 170685 length 1038 + Stream: column 3 section BLOOM_FILTER_UTF8 start: 171723 length 891 + Stream: column 1 section DATA start: 172614 length 20035 + Stream: column 2 section DATA start: 192649 length 40050 + Stream: column 3 section DATA start: 232699 length 3544 + Stream: column 3 section LENGTH start: 236243 length 25 + Stream: column 3 section DICTIONARY_DATA start: 236268 length 133 Encoding column 0: DIRECT Encoding column 1: DIRECT_V2 Encoding column 2: DIRECT_V2 @@ -132,20 +132,20 @@ Stripes: Entry 3: numHashFunctions: 7 bitCount: 9600 popCount: 4943 loadFactor: 0.5149 expectedFpp: 0.009594797 Entry 4: numHashFunctions: 7 bitCount: 9600 popCount: 4930 loadFactor: 0.5135 expectedFpp: 0.009419539 Stripe level merge: numHashFunctions: 7 bitCount: 9600 popCount: 9333 loadFactor: 0.9722 expectedFpp: 0.82082444 - Stripe: offset: 236497 data: 63817 rows: 5000 tail: 103 index: 14939 - Stream: column 0 section ROW_INDEX start: 236497 length 17 - Stream: column 1 section ROW_INDEX start: 236514 length 165 - Stream: column 2 section ROW_INDEX start: 236679 length 167 - Stream: column 2 section BLOOM_FILTER start: 236846 length 6524 - Stream: column 2 section BLOOM_FILTER_UTF8 start: 243370 length 6046 - Stream: column 3 section ROW_INDEX start: 249416 length 91 - Stream: column 3 section BLOOM_FILTER start: 249507 length 1038 - Stream: column 3 section BLOOM_FILTER_UTF8 start: 250545 length 891 - Stream: column 1 section DATA start: 251436 length 20035 - Stream: column 2 section DATA start: 271471 length 40050 - Stream: column 3 section DATA start: 311521 length 3574 - Stream: column 3 section LENGTH start: 315095 length 25 - Stream: column 3 section DICTIONARY_DATA start: 315120 length 133 + Stripe: offset: 236509 data: 63817 rows: 5000 tail: 107 index: 14939 + Stream: column 0 section ROW_INDEX start: 236509 length 17 + Stream: column 1 section ROW_INDEX start: 236526 length 165 + Stream: column 2 section ROW_INDEX start: 236691 length 167 + Stream: column 2 section BLOOM_FILTER start: 236858 length 6524 + Stream: column 2 section BLOOM_FILTER_UTF8 start: 243382 length 6046 + Stream: column 3 section ROW_INDEX start: 249428 length 91 + Stream: column 3 section BLOOM_FILTER start: 249519 length 1038 + Stream: column 3 section BLOOM_FILTER_UTF8 start: 250557 length 891 + Stream: column 1 section DATA start: 251448 length 20035 + Stream: column 2 section DATA start: 271483 length 40050 + Stream: column 3 section DATA start: 311533 length 3574 + Stream: column 3 section 
LENGTH start: 315107 length 25 + Stream: column 3 section DICTIONARY_DATA start: 315132 length 133 Encoding column 0: DIRECT Encoding column 1: DIRECT_V2 Encoding column 2: DIRECT_V2 @@ -163,20 +163,20 @@ Stripes: Entry 3: numHashFunctions: 7 bitCount: 9600 popCount: 4941 loadFactor: 0.5147 expectedFpp: 0.009567649 Entry 4: numHashFunctions: 7 bitCount: 9600 popCount: 4993 loadFactor: 0.5201 expectedFpp: 0.010295142 Stripe level merge: numHashFunctions: 7 bitCount: 9600 popCount: 9353 loadFactor: 0.9743 expectedFpp: 0.8332165 - Stripe: offset: 315356 data: 12943 rows: 1000 tail: 96 index: 3546 - Stream: column 0 section ROW_INDEX start: 315356 length 12 - Stream: column 1 section ROW_INDEX start: 315368 length 38 - Stream: column 2 section ROW_INDEX start: 315406 length 41 - Stream: column 2 section BLOOM_FILTER start: 315447 length 1337 - Stream: column 2 section BLOOM_FILTER_UTF8 start: 316784 length 1211 - Stream: column 3 section ROW_INDEX start: 317995 length 40 - Stream: column 3 section BLOOM_FILTER start: 318035 length 472 - Stream: column 3 section BLOOM_FILTER_UTF8 start: 318507 length 395 - Stream: column 1 section DATA start: 318902 length 4007 - Stream: column 2 section DATA start: 322909 length 8010 - Stream: column 3 section DATA start: 330919 length 768 - Stream: column 3 section LENGTH start: 331687 length 25 - Stream: column 3 section DICTIONARY_DATA start: 331712 length 133 + Stripe: offset: 315372 data: 12943 rows: 1000 tail: 102 index: 3546 + Stream: column 0 section ROW_INDEX start: 315372 length 12 + Stream: column 1 section ROW_INDEX start: 315384 length 38 + Stream: column 2 section ROW_INDEX start: 315422 length 41 + Stream: column 2 section BLOOM_FILTER start: 315463 length 1337 + Stream: column 2 section BLOOM_FILTER_UTF8 start: 316800 length 1211 + Stream: column 3 section ROW_INDEX start: 318011 length 40 + Stream: column 3 section BLOOM_FILTER start: 318051 length 472 + Stream: column 3 section BLOOM_FILTER_UTF8 start: 318523 length 395 + Stream: column 1 section DATA start: 318918 length 4007 + Stream: column 2 section DATA start: 322925 length 8010 + Stream: column 3 section DATA start: 330935 length 768 + Stream: column 3 section LENGTH start: 331703 length 25 + Stream: column 3 section DICTIONARY_DATA start: 331728 length 133 Encoding column 0: DIRECT Encoding column 1: DIRECT_V2 Encoding column 2: DIRECT_V2 @@ -187,7 +187,7 @@ Stripes: Entry 0: numHashFunctions: 7 bitCount: 9600 popCount: 4948 loadFactor: 0.5154 expectedFpp: 0.00966294 Stripe level merge: numHashFunctions: 7 bitCount: 9600 popCount: 4948 loadFactor: 0.5154 expectedFpp: 0.00966294 -File length: 332489 bytes +File length: 332511 bytes Padding length: 0 bytes Padding ratio: 0% ________________________________________________________________________________________________________________________ diff --git a/java/tools/src/test/resources/orc-file-dump-dictionary-threshold.out b/java/tools/src/test/resources/orc-file-dump-dictionary-threshold.out index b0315b4428..78e02589a4 100644 --- a/java/tools/src/test/resources/orc-file-dump-dictionary-threshold.out +++ b/java/tools/src/test/resources/orc-file-dump-dictionary-threshold.out @@ -1,5 +1,5 @@ Structure for TestFileDump.testDump.orc -File Version: 0.12 with ORC_101 +File Version: 0.12 with ORC_135 Rows: 21000 Compression: ZLIB Compression size: 4096 diff --git a/java/tools/src/test/resources/orc-file-dump.json b/java/tools/src/test/resources/orc-file-dump.json index b3e9d127cb..3914f823da 100644 --- 
a/java/tools/src/test/resources/orc-file-dump.json +++ b/java/tools/src/test/resources/orc-file-dump.json @@ -1,7 +1,7 @@ { "fileName": "TestFileDump.testDump.orc", "fileVersion": "0.12", - "writerVersion": "ORC_101", + "writerVersion": "ORC_135", "numberOfRows": 21000, "compression": "ZLIB", "compressionBufferSize": 4096, @@ -256,7 +256,7 @@ "offset": 3, "indexLength": 768, "dataLength": 63770, - "footerLength": 88, + "footerLength": 89, "rowCount": 5000 }, "streams": [ @@ -494,77 +494,77 @@ { "stripeNumber": 2, "stripeInformation": { - "offset": 64629, + "offset": 64630, "indexLength": 759, "dataLength": 63763, - "footerLength": 87, + "footerLength": 88, "rowCount": 5000 }, "streams": [ { "columnId": 0, "section": "ROW_INDEX", - "startOffset": 64629, + "startOffset": 64630, "length": 17 }, { "columnId": 1, "section": "ROW_INDEX", - "startOffset": 64646, + "startOffset": 64647, "length": 166 }, { "columnId": 2, "section": "ROW_INDEX", - "startOffset": 64812, + "startOffset": 64813, "length": 166 }, { "columnId": 3, "section": "ROW_INDEX", - "startOffset": 64978, + "startOffset": 64979, "length": 100 }, { "columnId": 3, "section": "BLOOM_FILTER_UTF8", - "startOffset": 65078, + "startOffset": 65079, "length": 310 }, { "columnId": 1, "section": "DATA", - "startOffset": 65388, + "startOffset": 65389, "length": 20035 }, { "columnId": 2, "section": "DATA", - "startOffset": 85423, + "startOffset": 85424, "length": 40050 }, { "columnId": 3, "section": "PRESENT", - "startOffset": 125473, + "startOffset": 125474, "length": 17 }, { "columnId": 3, "section": "DATA", - "startOffset": 125490, + "startOffset": 125491, "length": 3503 }, { "columnId": 3, "section": "LENGTH", - "startOffset": 128993, + "startOffset": 128994, "length": 25 }, { "columnId": 3, "section": "DICTIONARY_DATA", - "startOffset": 129018, + "startOffset": 129019, "length": 133 } ], @@ -735,77 +735,77 @@ { "stripeNumber": 3, "stripeInformation": { - "offset": 129238, + "offset": 129240, "indexLength": 760, "dataLength": 63770, - "footerLength": 88, + "footerLength": 89, "rowCount": 5000 }, "streams": [ { "columnId": 0, "section": "ROW_INDEX", - "startOffset": 129238, + "startOffset": 129240, "length": 17 }, { "columnId": 1, "section": "ROW_INDEX", - "startOffset": 129255, + "startOffset": 129257, "length": 164 }, { "columnId": 2, "section": "ROW_INDEX", - "startOffset": 129419, + "startOffset": 129421, "length": 167 }, { "columnId": 3, "section": "ROW_INDEX", - "startOffset": 129586, + "startOffset": 129588, "length": 102 }, { "columnId": 3, "section": "BLOOM_FILTER_UTF8", - "startOffset": 129688, + "startOffset": 129690, "length": 310 }, { "columnId": 1, "section": "DATA", - "startOffset": 129998, + "startOffset": 130000, "length": 20035 }, { "columnId": 2, "section": "DATA", - "startOffset": 150033, + "startOffset": 150035, "length": 40050 }, { "columnId": 3, "section": "PRESENT", - "startOffset": 190083, + "startOffset": 190085, "length": 17 }, { "columnId": 3, "section": "DATA", - "startOffset": 190100, + "startOffset": 190102, "length": 3510 }, { "columnId": 3, "section": "LENGTH", - "startOffset": 193610, + "startOffset": 193612, "length": 25 }, { "columnId": 3, "section": "DICTIONARY_DATA", - "startOffset": 193635, + "startOffset": 193637, "length": 133 } ], @@ -976,77 +976,77 @@ { "stripeNumber": 4, "stripeInformation": { - "offset": 193856, + "offset": 193859, "indexLength": 771, "dataLength": 63756, - "footerLength": 89, + "footerLength": 90, "rowCount": 5000 }, "streams": [ { "columnId": 0, "section": "ROW_INDEX", - 
"startOffset": 193856, + "startOffset": 193859, "length": 17 }, { "columnId": 1, "section": "ROW_INDEX", - "startOffset": 193873, + "startOffset": 193876, "length": 166 }, { "columnId": 2, "section": "ROW_INDEX", - "startOffset": 194039, + "startOffset": 194042, "length": 171 }, { "columnId": 3, "section": "ROW_INDEX", - "startOffset": 194210, + "startOffset": 194213, "length": 107 }, { "columnId": 3, "section": "BLOOM_FILTER_UTF8", - "startOffset": 194317, + "startOffset": 194320, "length": 310 }, { "columnId": 1, "section": "DATA", - "startOffset": 194627, + "startOffset": 194630, "length": 20035 }, { "columnId": 2, "section": "DATA", - "startOffset": 214662, + "startOffset": 214665, "length": 40050 }, { "columnId": 3, "section": "PRESENT", - "startOffset": 254712, + "startOffset": 254715, "length": 17 }, { "columnId": 3, "section": "DATA", - "startOffset": 254729, + "startOffset": 254732, "length": 3496 }, { "columnId": 3, "section": "LENGTH", - "startOffset": 258225, + "startOffset": 258228, "length": 25 }, { "columnId": 3, "section": "DICTIONARY_DATA", - "startOffset": 258250, + "startOffset": 258253, "length": 133 } ], @@ -1217,77 +1217,77 @@ { "stripeNumber": 5, "stripeInformation": { - "offset": 258472, + "offset": 258476, "indexLength": 376, "dataLength": 12943, - "footerLength": 83, + "footerLength": 85, "rowCount": 1000 }, "streams": [ { "columnId": 0, "section": "ROW_INDEX", - "startOffset": 258472, + "startOffset": 258476, "length": 12 }, { "columnId": 1, "section": "ROW_INDEX", - "startOffset": 258484, + "startOffset": 258488, "length": 38 }, { "columnId": 2, "section": "ROW_INDEX", - "startOffset": 258522, + "startOffset": 258526, "length": 41 }, { "columnId": 3, "section": "ROW_INDEX", - "startOffset": 258563, + "startOffset": 258567, "length": 41 }, { "columnId": 3, "section": "BLOOM_FILTER_UTF8", - "startOffset": 258604, + "startOffset": 258608, "length": 244 }, { "columnId": 1, "section": "DATA", - "startOffset": 258848, + "startOffset": 258852, "length": 4007 }, { "columnId": 2, "section": "DATA", - "startOffset": 262855, + "startOffset": 262859, "length": 8010 }, { "columnId": 3, "section": "PRESENT", - "startOffset": 270865, + "startOffset": 270869, "length": 16 }, { "columnId": 3, "section": "DATA", - "startOffset": 270881, + "startOffset": 270885, "length": 752 }, { "columnId": 3, "section": "LENGTH", - "startOffset": 271633, + "startOffset": 271637, "length": 25 }, { "columnId": 3, "section": "DICTIONARY_DATA", - "startOffset": 271658, + "startOffset": 271662, "length": 133 } ], @@ -1348,7 +1348,7 @@ }] } ], - "fileLength": 272428, + "fileLength": 272434, "paddingLength": 0, "paddingRatio": 0, "status": "OK" diff --git a/java/tools/src/test/resources/orc-file-dump.out b/java/tools/src/test/resources/orc-file-dump.out index ae8195e842..51105f0dba 100644 --- a/java/tools/src/test/resources/orc-file-dump.out +++ b/java/tools/src/test/resources/orc-file-dump.out @@ -1,5 +1,5 @@ Structure for TestFileDump.testDump.orc -File Version: 0.12 with ORC_101 +File Version: 0.12 with ORC_135 Rows: 21000 Compression: ZLIB Compression size: 4096 diff --git a/java/tools/src/test/resources/orc-file-has-null.out b/java/tools/src/test/resources/orc-file-has-null.out index c02f803e4a..a42a62d864 100644 --- a/java/tools/src/test/resources/orc-file-has-null.out +++ b/java/tools/src/test/resources/orc-file-has-null.out @@ -1,5 +1,5 @@ Structure for TestFileDump.testDump.orc -File Version: 0.12 with ORC_101 +File Version: 0.12 with ORC_135 Rows: 20000 Compression: ZLIB Compression size: 
4096 diff --git a/proto/orc_proto.proto b/proto/orc_proto.proto index de6974ea4b..ee2ca45e27 100644 --- a/proto/orc_proto.proto +++ b/proto/orc_proto.proto @@ -59,6 +59,8 @@ message TimestampStatistics { // min,max values saved as milliseconds since epoch optional sint64 minimum = 1; optional sint64 maximum = 2; + optional sint64 minimumUtc = 3; + optional sint64 maximumUtc = 4; } message BinaryStatistics { @@ -126,6 +128,11 @@ message ColumnEncoding { } optional Kind kind = 1; optional uint32 dictionarySize = 2; + + // The encoding of the bloom filters for this column: + // 0 or missing = none or original + // 1 = ORC-135 (utc for timestamps) + optional uint32 bloomEncoding = 3; } message StripeFooter { @@ -220,6 +227,8 @@ message PostScript { // 2 = HIVE-4243 fixed // 3 = HIVE-12055 fixed // 4 = HIVE-13083 fixed + // 5 = ORC-101 fixed + // 6 = ORC-135 fixed optional uint32 writerVersion = 6; // Leave this last in the record optional string magic = 8000; @@ -232,4 +241,4 @@ message FileTail { optional Footer footer = 2; optional uint64 fileLength = 3; optional uint64 postscriptLength = 4; -} \ No newline at end of file +} From df4801dbf658fff02ce2272198f89dc5c93a17e8 Mon Sep 17 00:00:00 2001 From: Owen O'Malley Date: Tue, 7 Feb 2017 14:37:55 -0800 Subject: [PATCH 11/21] Preparing for release 1.3.2. Signed-off-by: Owen O'Malley --- CMakeLists.txt | 2 +- java/core/pom.xml | 2 +- java/mapreduce/pom.xml | 2 +- java/pom.xml | 8 ++++---- java/tools/pom.xml | 2 +- 5 files changed, 8 insertions(+), 8 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 1714d0b769..dc95aee9e9 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -17,7 +17,7 @@ project(ORC) # Version number of package SET(CPACK_PACKAGE_VERSION_MAJOR "1") SET(CPACK_PACKAGE_VERSION_MINOR "3") -SET(CPACK_PACKAGE_VERSION_PATCH "2-SNAPSHOT") +SET(CPACK_PACKAGE_VERSION_PATCH "2") SET(ORC_VERSION "${CPACK_PACKAGE_VERSION_MAJOR}.${CPACK_PACKAGE_VERSION_MINOR}.${CPACK_PACKAGE_VERSION_PATCH}") option (BUILD_JAVA diff --git a/java/core/pom.xml b/java/core/pom.xml index 8fae424c7e..025239070d 100644 --- a/java/core/pom.xml +++ b/java/core/pom.xml @@ -19,7 +19,7 @@ org.apache.orc orc - 1.3.2-SNAPSHOT + 1.3.2 ../pom.xml diff --git a/java/mapreduce/pom.xml b/java/mapreduce/pom.xml index affa963e4a..24c26fc167 100644 --- a/java/mapreduce/pom.xml +++ b/java/mapreduce/pom.xml @@ -19,7 +19,7 @@ org.apache.orc orc - 1.3.2-SNAPSHOT + 1.3.2 ../pom.xml diff --git a/java/pom.xml b/java/pom.xml index 8becf3d57f..5860a52934 100644 --- a/java/pom.xml +++ b/java/pom.xml @@ -21,7 +21,7 @@ org.apache.orc orc - 1.3.2-SNAPSHOT + 1.3.2 pom Apache ORC @@ -238,17 +238,17 @@ org.apache.orc orc-core - 1.3.2-SNAPSHOT + 1.3.2 org.apache.orc orc-mapreduce - 1.3.2-SNAPSHOT + 1.3.2 org.apache.orc orc-tools - 1.3.2-SNAPSHOT + 1.3.2 diff --git a/java/tools/pom.xml b/java/tools/pom.xml index 0908c4dc3b..168bcd1538 100644 --- a/java/tools/pom.xml +++ b/java/tools/pom.xml @@ -19,7 +19,7 @@ org.apache.orc orc - 1.3.2-SNAPSHOT + 1.3.2 ../pom.xml From e7654fcbb27e5a3e4175009f331c5b92065c43bd Mon Sep 17 00:00:00 2001 From: Owen O'Malley Date: Tue, 7 Feb 2017 17:47:47 -0800 Subject: [PATCH 12/21] ORC-142. Fix TimestampColumnStatistics when reading from old files (pre ORC-135). 
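The bug is a one-line copy/paste error in the pre-ORC-135 fallback of TimestampStatisticsImpl: when a file carries only the legacy minimum/maximum (stored in the writer's local time zone), the converted minimum was assigned to the maximum field. A condensed sketch of the intended fallback, using the names from the diff below (the surrounding class is abbreviated):

    // Legacy files (pre ORC-135) stored min/max in the writer's local
    // time zone, so both must be converted to UTC; the bug assigned the
    // converted minimum to the maximum field.
    if (timestampStats.hasMaximum()) {
      maximum = SerializationUtils.convertToUtc(TimeZone.getDefault(),
          timestampStats.getMaximum());
    }
    if (timestampStats.hasMinimum()) {
      minimum = SerializationUtils.convertToUtc(TimeZone.getDefault(),
          timestampStats.getMinimum());
    }
    // Files written with ORC-135 or later carry explicit UTC values,
    // which take precedence over the converted legacy ones.
    if (timestampStats.hasMaximumUtc()) {
      maximum = timestampStats.getMaximumUtc();
    }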
Fixes #91 Signed-off-by: Owen O'Malley --- .../apache/orc/impl/ColumnStatisticsImpl.java | 2 +- .../orc/impl/TestColumnStatisticsImpl.java | 20 +++++++++++++++++++ java/pom.xml | 2 ++ 3 files changed, 23 insertions(+), 1 deletion(-) diff --git a/java/core/src/java/org/apache/orc/impl/ColumnStatisticsImpl.java b/java/core/src/java/org/apache/orc/impl/ColumnStatisticsImpl.java index f1ed646773..ec874d68ae 100644 --- a/java/core/src/java/org/apache/orc/impl/ColumnStatisticsImpl.java +++ b/java/core/src/java/org/apache/orc/impl/ColumnStatisticsImpl.java @@ -1123,7 +1123,7 @@ private static final class TimestampStatisticsImpl extends ColumnStatisticsImpl timestampStats.getMaximum()); } if (timestampStats.hasMinimum()) { - maximum = SerializationUtils.convertToUtc(TimeZone.getDefault(), + minimum = SerializationUtils.convertToUtc(TimeZone.getDefault(), timestampStats.getMinimum()); } if (timestampStats.hasMaximumUtc()) { diff --git a/java/core/src/test/org/apache/orc/impl/TestColumnStatisticsImpl.java b/java/core/src/test/org/apache/orc/impl/TestColumnStatisticsImpl.java index 6165526171..6528e75a35 100644 --- a/java/core/src/test/org/apache/orc/impl/TestColumnStatisticsImpl.java +++ b/java/core/src/test/org/apache/orc/impl/TestColumnStatisticsImpl.java @@ -18,11 +18,19 @@ package org.apache.orc.impl; +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.Path; import org.apache.hadoop.hive.serde2.io.DateWritable; +import org.apache.orc.ColumnStatistics; +import org.apache.orc.OrcFile; import org.apache.orc.OrcProto; +import org.apache.orc.Reader; +import org.apache.orc.TimestampColumnStatistics; import org.apache.orc.TypeDescription; import org.junit.Test; +import java.io.IOException; + import static org.junit.Assert.assertEquals; import static org.junit.Assert.assertFalse; import static org.junit.Assert.assertTrue; @@ -61,4 +69,16 @@ private void assertDateStatistics(ColumnStatisticsImpl stat, int count, int mini assertTrue(protoStat.hasMaximum()); assertEquals(maximum, protoStat.getMaximum()); } + + @Test + public void testOldTimestamps() throws IOException { + Path exampleDir = new Path(System.getProperty("example.dir")); + Path file = new Path(exampleDir, "TestOrcFile.testTimestamp.orc"); + Configuration conf = new Configuration(); + Reader reader = OrcFile.createReader(file, OrcFile.readerOptions(conf)); + TimestampColumnStatistics stats = + (TimestampColumnStatistics) reader.getStatistics()[0]; + assertEquals("1995-01-01 00:00:00.688", stats.getMinimum().toString()); + assertEquals("2037-01-01 00:00:00.0", stats.getMaximum().toString()); + } } diff --git a/java/pom.xml b/java/pom.xml index 5860a52934..a1fa79c495 100644 --- a/java/pom.xml +++ b/java/pom.xml @@ -67,6 +67,7 @@ UTF-8 false ${project.build.directory}/testing-tmp + ${project.basedir}/../../examples 2.6.4 2.2.0 @@ -137,6 +138,7 @@ false ${test.tmp.dir} + ${example.dir} From 731a2ee841f4ee46a193072caf90e50449cb6d91 Mon Sep 17 00:00:00 2001 From: Owen O'Malley Date: Mon, 13 Feb 2017 08:25:13 -0800 Subject: [PATCH 13/21] Update branch-1.3 after 1.3.2 release. 
--- CMakeLists.txt | 2 +- java/core/pom.xml | 2 +- java/mapreduce/pom.xml | 2 +- java/pom.xml | 8 ++++---- java/tools/pom.xml | 2 +- 5 files changed, 8 insertions(+), 8 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index dc95aee9e9..fb84ad0998 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -17,7 +17,7 @@ project(ORC) # Version number of package SET(CPACK_PACKAGE_VERSION_MAJOR "1") SET(CPACK_PACKAGE_VERSION_MINOR "3") -SET(CPACK_PACKAGE_VERSION_PATCH "2") +SET(CPACK_PACKAGE_VERSION_PATCH "3-SNAPSHOT") SET(ORC_VERSION "${CPACK_PACKAGE_VERSION_MAJOR}.${CPACK_PACKAGE_VERSION_MINOR}.${CPACK_PACKAGE_VERSION_PATCH}") option (BUILD_JAVA diff --git a/java/core/pom.xml b/java/core/pom.xml index 025239070d..9dace4b77c 100644 --- a/java/core/pom.xml +++ b/java/core/pom.xml @@ -19,7 +19,7 @@ org.apache.orc orc - 1.3.2 + 1.3.3-SNAPSHOT ../pom.xml diff --git a/java/mapreduce/pom.xml b/java/mapreduce/pom.xml index 24c26fc167..0696d22d8d 100644 --- a/java/mapreduce/pom.xml +++ b/java/mapreduce/pom.xml @@ -19,7 +19,7 @@ org.apache.orc orc - 1.3.2 + 1.3.3-SNAPSHOT ../pom.xml diff --git a/java/pom.xml b/java/pom.xml index a1fa79c495..1ed3dec6d5 100644 --- a/java/pom.xml +++ b/java/pom.xml @@ -21,7 +21,7 @@ org.apache.orc orc - 1.3.2 + 1.3.3-SNAPSHOT pom Apache ORC @@ -240,17 +240,17 @@ org.apache.orc orc-core - 1.3.2 + 1.3.3-SNAPSHOT org.apache.orc orc-mapreduce - 1.3.2 + 1.3.3-SNAPSHOT org.apache.orc orc-tools - 1.3.2 + 1.3.3-SNAPSHOT diff --git a/java/tools/pom.xml b/java/tools/pom.xml index 168bcd1538..ca1ebd1ac9 100644 --- a/java/tools/pom.xml +++ b/java/tools/pom.xml @@ -19,7 +19,7 @@ org.apache.orc orc - 1.3.2 + 1.3.3-SNAPSHOT ../pom.xml From e19ed719310b1210e2624ad68124d7f79a71fb01 Mon Sep 17 00:00:00 2001 From: Owen O'Malley Date: Tue, 14 Feb 2017 13:08:50 -0800 Subject: [PATCH 14/21] ORC-147. Fix backwards compatibility with Hive 2.1. (omalley) Fixes #92 Signed-off-by: Owen O'Malley --- java/core/pom.xml | 3 ++ .../org/apache/orc/impl/MemoryManager.java | 32 ++++++++++++++ .../org/apache/orc/impl/RecordReaderImpl.java | 43 ++++++++++++++++++- .../org/apache/orc/impl/SchemaEvolution.java | 20 ++++++--- .../apache/orc/impl/TestSchemaEvolution.java | 22 +++++----- 5 files changed, 103 insertions(+), 17 deletions(-) create mode 100644 java/core/src/java/org/apache/orc/impl/MemoryManager.java diff --git a/java/core/pom.xml b/java/core/pom.xml index 9dace4b77c..2a4748efe7 100644 --- a/java/core/pom.xml +++ b/java/core/pom.xml @@ -102,6 +102,9 @@ org.apache.maven.plugins maven-javadoc-plugin + + **/OrcProto.java + ${project.artifactId} diff --git a/java/core/src/java/org/apache/orc/impl/MemoryManager.java b/java/core/src/java/org/apache/orc/impl/MemoryManager.java new file mode 100644 index 0000000000..30535bf087 --- /dev/null +++ b/java/core/src/java/org/apache/orc/impl/MemoryManager.java @@ -0,0 +1,32 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.orc.impl; + +import org.apache.hadoop.conf.Configuration; + +/** + * Shim for backwards compatibility with Hive + */ +@Deprecated +public class MemoryManager extends MemoryManagerImpl { + + public MemoryManager(Configuration conf) { + super(conf); + } +} diff --git a/java/core/src/java/org/apache/orc/impl/RecordReaderImpl.java b/java/core/src/java/org/apache/orc/impl/RecordReaderImpl.java index 766eb00c0f..f75d70aa51 100644 --- a/java/core/src/java/org/apache/orc/impl/RecordReaderImpl.java +++ b/java/core/src/java/org/apache/orc/impl/RecordReaderImpl.java @@ -138,6 +138,47 @@ public static int[] mapSargColumnsToOrcInternalColIdx(List sargLe return result; } + /** + * Given a list of column names, find the given column and return the index. + * + * @param columnNames the list of potential column names + * @param columnName the column name to look for + * @param rootColumn offset the result with the rootColumn + * @return the column number or -1 if the column wasn't found + */ + private static int findColumns(String[] columnNames, + String columnName, + int rootColumn) { + for(int i=0; i < columnNames.length; ++i) { + if (columnName.equals(columnNames[i])) { + return i + rootColumn; + } + } + return -1; + } + + /** + * Find the mapping from predicate leaves to columns. + * @param sargLeaves the search argument that we need to map + * @param columnNames the names of the columns + * @param rootColumn the offset of the top level row, which offsets the + * result + * @return an array mapping the sarg leaves to concrete column numbers + * @deprecated Use #mapSargColumnsToOrcInternalColIdx(List, SchemaEvolution) + */ + @Deprecated + public static int[] mapSargColumnsToOrcInternalColIdx(List sargLeaves, + String[] columnNames, + int rootColumn) { + int[] result = new int[sargLeaves.size()]; + Arrays.fill(result, -1); + for(int i=0; i < result.length; ++i) { + String colName = sargLeaves.get(i).getColumnName(); + result[i] = findColumns(columnNames, colName, rootColumn); + } + return result; + } + protected RecordReaderImpl(ReaderImpl fileReader, Reader.Options options) throws IOException { this.writerVersion = fileReader.getWriterVersion(); @@ -146,7 +187,7 @@ protected RecordReaderImpl(ReaderImpl fileReader, LOG.info("Reader schema not provided -- using file schema " + fileReader.getSchema()); } - evolution = new SchemaEvolution(fileReader.getSchema(), options); + evolution = new SchemaEvolution(fileReader.getSchema(), null, options); } else { // Now that we are creating a record reader for a file, validate that diff --git a/java/core/src/java/org/apache/orc/impl/SchemaEvolution.java b/java/core/src/java/org/apache/orc/impl/SchemaEvolution.java index a438db3abf..886d58539e 100644 --- a/java/core/src/java/org/apache/orc/impl/SchemaEvolution.java +++ b/java/core/src/java/org/apache/orc/impl/SchemaEvolution.java @@ -25,6 +25,7 @@ import java.util.Map; import java.util.regex.Pattern; +import org.apache.hadoop.conf.Configuration; import org.apache.orc.Reader; import org.apache.orc.TypeDescription; import org.slf4j.Logger; @@ -62,11 +63,6 @@ public 
IllegalEvolutionException(String msg) { } } - public SchemaEvolution(TypeDescription fileSchema, - Reader.Options options) { - this(fileSchema, null, options); - } - public SchemaEvolution(TypeDescription fileSchema, TypeDescription readerSchema, Reader.Options options) { @@ -131,6 +127,20 @@ public SchemaEvolution(TypeDescription fileSchema, this.ppdSafeConversion = populatePpdSafeConversion(); } + @Deprecated + public SchemaEvolution(TypeDescription fileSchema, boolean[] readerIncluded) { + this(fileSchema, null, readerIncluded); + } + + @Deprecated + public SchemaEvolution(TypeDescription fileSchema, + TypeDescription readerSchema, + boolean[] readerIncluded) { + this(fileSchema, readerSchema, + new Reader.Options(new Configuration()) + .include(readerIncluded)); + } + // Return true iff all fields have names like _col[0-9]+ private boolean hasColumnNames(TypeDescription fileSchema) { if (fileSchema.getCategory() != TypeDescription.Category.STRUCT) { diff --git a/java/core/src/test/org/apache/orc/impl/TestSchemaEvolution.java b/java/core/src/test/org/apache/orc/impl/TestSchemaEvolution.java index ac0115da8c..82f823aff9 100644 --- a/java/core/src/test/org/apache/orc/impl/TestSchemaEvolution.java +++ b/java/core/src/test/org/apache/orc/impl/TestSchemaEvolution.java @@ -78,7 +78,7 @@ public void testDataTypeConversion1() throws IOException { .addField("f1", TypeDescription.createInt()) .addField("f2", TypeDescription.createString()) .addField("f3", TypeDescription.createDecimal().withPrecision(38).withScale(10)); - SchemaEvolution same1 = new SchemaEvolution(fileStruct1, options); + SchemaEvolution same1 = new SchemaEvolution(fileStruct1, null, options); assertFalse(same1.hasConversion()); TypeDescription readerStruct1 = TypeDescription.createStruct() .addField("f1", TypeDescription.createInt()) @@ -113,7 +113,7 @@ public void testDataTypeConversion2() throws IOException { .addField("f4", TypeDescription.createDouble()) .addField("f5", TypeDescription.createBoolean())) .addField("f6", TypeDescription.createChar().withMaxLength(100)); - SchemaEvolution same2 = new SchemaEvolution(fileStruct2, options); + SchemaEvolution same2 = new SchemaEvolution(fileStruct2, null, options); assertFalse(same2.hasConversion()); TypeDescription readerStruct2 = TypeDescription.createStruct() .addField("f1", TypeDescription.createUnion() @@ -186,7 +186,7 @@ public void testSafePpdEvaluation() throws IOException { .addField("f1", TypeDescription.createInt()) .addField("f2", TypeDescription.createString()) .addField("f3", TypeDescription.createDecimal().withPrecision(38).withScale(10)); - SchemaEvolution same1 = new SchemaEvolution(fileStruct1, options); + SchemaEvolution same1 = new SchemaEvolution(fileStruct1, null, options); assertTrue(same1.isPPDSafeConversion(0)); assertFalse(same1.hasConversion()); TypeDescription readerStruct1 = TypeDescription.createStruct() @@ -247,7 +247,7 @@ public void testSafePpdEvaluationForInts() throws IOException { // byte -> short -> int -> long TypeDescription fileSchema = TypeDescription.createStruct() .addField("f1", TypeDescription.createByte()); - SchemaEvolution schemaEvolution = new SchemaEvolution(fileSchema, options); + SchemaEvolution schemaEvolution = new SchemaEvolution(fileSchema, null, options); assertFalse(schemaEvolution.hasConversion()); // byte -> short @@ -277,7 +277,7 @@ public void testSafePpdEvaluationForInts() throws IOException { // short -> int -> long fileSchema = TypeDescription.createStruct() .addField("f1", TypeDescription.createShort()); - 
schemaEvolution = new SchemaEvolution(fileSchema, options); + schemaEvolution = new SchemaEvolution(fileSchema, null, options); assertFalse(schemaEvolution.hasConversion()); // unsafe conversion short -> byte @@ -307,7 +307,7 @@ public void testSafePpdEvaluationForInts() throws IOException { // int -> long fileSchema = TypeDescription.createStruct() .addField("f1", TypeDescription.createInt()); - schemaEvolution = new SchemaEvolution(fileSchema, options); + schemaEvolution = new SchemaEvolution(fileSchema, null, options); assertFalse(schemaEvolution.hasConversion()); // unsafe conversion int -> byte @@ -337,7 +337,7 @@ public void testSafePpdEvaluationForInts() throws IOException { // long fileSchema = TypeDescription.createStruct() .addField("f1", TypeDescription.createLong()); - schemaEvolution = new SchemaEvolution(fileSchema, options); + schemaEvolution = new SchemaEvolution(fileSchema, null, options); assertTrue(schemaEvolution.isPPDSafeConversion(0)); assertFalse(schemaEvolution.hasConversion()); @@ -394,7 +394,7 @@ public void testSafePpdEvaluationForInts() throws IOException { public void testSafePpdEvaluationForStrings() throws IOException { TypeDescription fileSchema = TypeDescription.createStruct() .addField("f1", TypeDescription.createString()); - SchemaEvolution schemaEvolution = new SchemaEvolution(fileSchema, options); + SchemaEvolution schemaEvolution = new SchemaEvolution(fileSchema, null, options); assertTrue(schemaEvolution.isPPDSafeConversion(0)); assertFalse(schemaEvolution.hasConversion()); @@ -416,7 +416,7 @@ public void testSafePpdEvaluationForStrings() throws IOException { fileSchema = TypeDescription.createStruct() .addField("f1", TypeDescription.createChar()); - schemaEvolution = new SchemaEvolution(fileSchema, options); + schemaEvolution = new SchemaEvolution(fileSchema, null, options); assertTrue(schemaEvolution.isPPDSafeConversion(0)); assertFalse(schemaEvolution.hasConversion()); @@ -438,7 +438,7 @@ public void testSafePpdEvaluationForStrings() throws IOException { fileSchema = TypeDescription.createStruct() .addField("f1", TypeDescription.createVarchar()); - schemaEvolution = new SchemaEvolution(fileSchema, options); + schemaEvolution = new SchemaEvolution(fileSchema, null, options); assertTrue(schemaEvolution.isPPDSafeConversion(0)); assertFalse(schemaEvolution.hasConversion()); @@ -994,7 +994,7 @@ public void testNonAcidPositionSubstructure() { public void testFileIncludeWithNoEvolution() { TypeDescription fileType = TypeDescription.fromString( "struct"); - SchemaEvolution evo = new SchemaEvolution(fileType, + SchemaEvolution evo = new SchemaEvolution(fileType, null, options.include(new boolean[]{true, false, true, false})); assertFalse(evo.isAcid()); assertEquals("struct", From fdcfae843033f2317165843eac507c028b4420e1 Mon Sep 17 00:00:00 2001 From: Owen O'Malley Date: Thu, 16 Feb 2017 11:03:32 -0800 Subject: [PATCH 15/21] Preparing for 1.3.3 release. 
Signed-off-by: Owen O'Malley --- CMakeLists.txt | 2 +- java/core/pom.xml | 2 +- java/mapreduce/pom.xml | 2 +- java/pom.xml | 8 ++++---- java/tools/pom.xml | 2 +- 5 files changed, 8 insertions(+), 8 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index fb84ad0998..7fd08bb620 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -17,7 +17,7 @@ project(ORC) # Version number of package SET(CPACK_PACKAGE_VERSION_MAJOR "1") SET(CPACK_PACKAGE_VERSION_MINOR "3") -SET(CPACK_PACKAGE_VERSION_PATCH "3-SNAPSHOT") +SET(CPACK_PACKAGE_VERSION_PATCH "3") SET(ORC_VERSION "${CPACK_PACKAGE_VERSION_MAJOR}.${CPACK_PACKAGE_VERSION_MINOR}.${CPACK_PACKAGE_VERSION_PATCH}") option (BUILD_JAVA diff --git a/java/core/pom.xml b/java/core/pom.xml index 2a4748efe7..6b29581073 100644 --- a/java/core/pom.xml +++ b/java/core/pom.xml @@ -19,7 +19,7 @@ org.apache.orc orc - 1.3.3-SNAPSHOT + 1.3.3 ../pom.xml diff --git a/java/mapreduce/pom.xml b/java/mapreduce/pom.xml index 0696d22d8d..93aea43eb1 100644 --- a/java/mapreduce/pom.xml +++ b/java/mapreduce/pom.xml @@ -19,7 +19,7 @@ org.apache.orc orc - 1.3.3-SNAPSHOT + 1.3.3 ../pom.xml diff --git a/java/pom.xml b/java/pom.xml index 1ed3dec6d5..128789cf80 100644 --- a/java/pom.xml +++ b/java/pom.xml @@ -21,7 +21,7 @@ org.apache.orc orc - 1.3.3-SNAPSHOT + 1.3.3 pom Apache ORC @@ -240,17 +240,17 @@ org.apache.orc orc-core - 1.3.3-SNAPSHOT + 1.3.3 org.apache.orc orc-mapreduce - 1.3.3-SNAPSHOT + 1.3.3 org.apache.orc orc-tools - 1.3.3-SNAPSHOT + 1.3.3 diff --git a/java/tools/pom.xml b/java/tools/pom.xml index ca1ebd1ac9..154b1a6569 100644 --- a/java/tools/pom.xml +++ b/java/tools/pom.xml @@ -19,7 +19,7 @@ org.apache.orc orc - 1.3.3-SNAPSHOT + 1.3.3 ../pom.xml From a13ebed11b1d7149856af7ad522793e5ec5c3c12 Mon Sep 17 00:00:00 2001 From: Owen O'Malley Date: Thu, 16 Feb 2017 11:14:42 -0800 Subject: [PATCH 16/21] ORC-149. Update storage-api to 2.2.1. 
Signed-off-by: Owen O'Malley --- java/pom.xml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/java/pom.xml b/java/pom.xml index 128789cf80..bf15e9f1ed 100644 --- a/java/pom.xml +++ b/java/pom.xml @@ -70,7 +70,7 @@ ${project.basedir}/../../examples 2.6.4 - 2.2.0 + 2.2.1 From b2570d1ca552e77413bf08fa3d53d049d2792d3f Mon Sep 17 00:00:00 2001 From: amrk7s Date: Wed, 27 Dec 2017 19:39:40 +0530 Subject: [PATCH 17/21] FDPIN-2699 Adding fk dists to publish orc libs to artifactory --- java/pom.xml | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/java/pom.xml b/java/pom.xml index bf15e9f1ed..60bdf50348 100644 --- a/java/pom.xml +++ b/java/pom.xml @@ -73,6 +73,19 @@ 2.2.1 + + + fk-art-snapshot + libs-snapshot + http://10.85.59.116/artifactory/v1.0/artifacts/libs-snapshots-local + + + fk-art-release + libs-rel + http://10.85.59.116/artifactory/v1.0/artifacts/libs-release-local + + + From 643773fdd3d43c4029e1513d2538fe1f41790980 Mon Sep 17 00:00:00 2001 From: amrk7s Date: Thu, 28 Dec 2017 13:38:54 +0530 Subject: [PATCH 18/21] ORC-264 Backporting case insensitive schema evolution changes --- java/core/pom.xml | 2 +- .../core/src/java/org/apache/orc/OrcConf.java | 4 ++- java/core/src/java/org/apache/orc/Reader.java | 18 ++++++++++ .../org/apache/orc/impl/SchemaEvolution.java | 33 +++++++++++++++++-- .../apache/orc/impl/TestSchemaEvolution.java | 30 +++++++++++++++++ java/mapreduce/pom.xml | 2 +- java/pom.xml | 8 ++--- java/tools/pom.xml | 2 +- 8 files changed, 88 insertions(+), 11 deletions(-) diff --git a/java/core/pom.xml b/java/core/pom.xml index 6b29581073..ba05fe8a68 100644 --- a/java/core/pom.xml +++ b/java/core/pom.xml @@ -19,7 +19,7 @@ org.apache.orc orc - 1.3.3 + 1.3.3.fk.2 ../pom.xml diff --git a/java/core/src/java/org/apache/orc/OrcConf.java b/java/core/src/java/org/apache/orc/OrcConf.java index 5eb2aa3a3e..aede0c88a8 100644 --- a/java/core/src/java/org/apache/orc/OrcConf.java +++ b/java/core/src/java/org/apache/orc/OrcConf.java @@ -142,7 +142,9 @@ public enum OrcConf { "orc.force.positional.evolution", false, "Require schema evolution to match the top level columns using position\n" + "rather than column names. 
This provides backwards compatibility with\n" + - "Hive 2.1.") + "Hive 2.1."), + IS_SCHEMA_EVOLUTION_CASE_SENSITIVE("orc.schema.evolution.case.sensitive", "orc.schema.evolution.case.sensitive", true, + "A boolean flag to determine if the comparison of field names in schema evolution is case sensitive.\n") ; private final String attribute; diff --git a/java/core/src/java/org/apache/orc/Reader.java b/java/core/src/java/org/apache/orc/Reader.java index 2ef64d72a8..d4e6b56863 100644 --- a/java/core/src/java/org/apache/orc/Reader.java +++ b/java/core/src/java/org/apache/orc/Reader.java @@ -160,6 +160,7 @@ public static class Options implements Cloneable { private DataReader dataReader = null; private Boolean tolerateMissingSchema = null; private boolean forcePositionalEvolution; + private boolean isSchemaEvolutionCaseAware; public Options() { // PASS @@ -170,6 +171,8 @@ public Options(Configuration conf) { skipCorruptRecords = OrcConf.SKIP_CORRUPT_DATA.getBoolean(conf); tolerateMissingSchema = OrcConf.TOLERATE_MISSING_SCHEMA.getBoolean(conf); forcePositionalEvolution = OrcConf.FORCE_POSITIONAL_EVOLUTION.getBoolean(conf); + isSchemaEvolutionCaseAware = + OrcConf.IS_SCHEMA_EVOLUTION_CASE_SENSITIVE.getBoolean(conf); } /** @@ -262,6 +265,17 @@ public Options forcePositionalEvolution(boolean value) { return this; } + /** + * Set the boolean flag that determines whether the comparison of field names + * in schema evolution is case sensitive. + * @param value true if the comparison should be case sensitive + * @return this + */ + public Options isSchemaEvolutionCaseAware(boolean value) { + this.isSchemaEvolutionCaseAware = value; + return this; + } + public boolean[] getInclude() { return include; } @@ -310,6 +324,10 @@ public boolean getForcePositionalEvolution() { return forcePositionalEvolution; } + public boolean getIsSchemaEvolutionCaseAware() { + return isSchemaEvolutionCaseAware; + } + public Options clone() { try { Options result = (Options) super.clone(); diff --git a/java/core/src/java/org/apache/orc/impl/SchemaEvolution.java b/java/core/src/java/org/apache/orc/impl/SchemaEvolution.java index 886d58539e..78acd41483 100644 --- a/java/core/src/java/org/apache/orc/impl/SchemaEvolution.java +++ b/java/core/src/java/org/apache/orc/impl/SchemaEvolution.java @@ -48,6 +48,7 @@ public class SchemaEvolution { private final TypeDescription readerSchema; private boolean hasConversion = false; private final boolean isAcid; + private final boolean isSchemaEvolutionCaseAware; // indexed by reader column id private final boolean[] ppdSafeConversion; @@ -57,6 +58,7 @@ public class SchemaEvolution { private static final Pattern missingMetadataPattern = Pattern.compile("_col\\d+"); + public static class IllegalEvolutionException extends RuntimeException { public IllegalEvolutionException(String msg) { super(msg); @@ -68,6 +70,7 @@ public SchemaEvolution(TypeDescription fileSchema, Reader.Options options) { boolean allowMissingMetadata = options.getTolerateMissingSchema(); boolean[] includedCols = options.getInclude(); + this.isSchemaEvolutionCaseAware = options.getIsSchemaEvolutionCaseAware(); this.readerIncluded = includedCols == null ?
null : Arrays.copyOf(includedCols, includedCols.length); this.fileIncluded = new boolean[fileSchema.getMaximumId() + 1]; @@ -398,13 +401,20 @@ void buildConversion(TypeDescription fileType, if (positionalLevels == 0) { List readerFieldNames = readerType.getFieldNames(); List fileFieldNames = fileType.getFieldNames(); - Map fileTypesIdx = new HashMap<>(); + + final Map fileTypesIdx; + if (isSchemaEvolutionCaseAware) { + fileTypesIdx = new HashMap<>(); + } else { + fileTypesIdx = new CaseInsensitiveMap(); + } for (int i = 0; i < fileFieldNames.size(); i++) { - fileTypesIdx.put(fileFieldNames.get(i), fileChildren.get(i)); + final String fileFieldName = fileFieldNames.get(i); + fileTypesIdx.put(fileFieldName, fileChildren.get(i)); } for (int i = 0; i < readerFieldNames.size(); i++) { - String readerFieldName = readerFieldNames.get(i); + final String readerFieldName = readerFieldNames.get(i); TypeDescription readerField = readerChildren.get(i); TypeDescription fileField = fileTypesIdx.get(readerFieldName); @@ -511,4 +521,21 @@ static TypeDescription getBaseRow(TypeDescription typeDescription) { acidEventFieldNames.add("currentTransaction"); acidEventFieldNames.add("row"); } + + private static class CaseInsensitiveMap extends HashMap { + @Override + public V put(String key, V value) { + return super.put(key.toLowerCase(), value); + } + + @Override + public V get(Object key) { + return this.get((String) key); + } + + // not @Override as key to be of type Object + public V get(String key) { + return super.get(key.toLowerCase()); + } + } } diff --git a/java/core/src/test/org/apache/orc/impl/TestSchemaEvolution.java b/java/core/src/test/org/apache/orc/impl/TestSchemaEvolution.java index 82f823aff9..510e00c93d 100644 --- a/java/core/src/test/org/apache/orc/impl/TestSchemaEvolution.java +++ b/java/core/src/test/org/apache/orc/impl/TestSchemaEvolution.java @@ -46,6 +46,7 @@ import org.apache.orc.TypeDescription; import org.apache.orc.Writer; import org.junit.Before; +import org.junit.Ignore; import org.junit.Rule; import org.junit.Test; import org.junit.rules.TestName; @@ -718,6 +719,35 @@ public void testAddFieldBeforeEndOfStruct() { original = fileType.getChildren().get(1); assertSame(original, mapped); } + @Test + public void testCaseMismatchInReaderAndWriterSchema() { + TypeDescription fileType = + TypeDescription.fromString("struct,c:string>"); + TypeDescription readerType = + TypeDescription.fromString("struct,c:string>"); + boolean[] included = includeAll(readerType); + options.tolerateMissingSchema(false); + SchemaEvolution transition = + new SchemaEvolution(fileType, readerType, options.include(included).isSchemaEvolutionCaseAware(false)); + + // a -> A + TypeDescription reader = readerType.getChildren().get(0); + TypeDescription mapped = transition.getFileType(reader); + TypeDescription original = fileType.getChildren().get(0); + assertSame(original, mapped); + + // a.b -> a.b + TypeDescription readerChild = reader.getChildren().get(0); + mapped = transition.getFileType(readerChild); + TypeDescription originalChild = original.getChildren().get(0); + assertSame(originalChild, mapped); + + // c -> c + reader = readerType.getChildren().get(1); + mapped = transition.getFileType(reader); + original = fileType.getChildren().get(1); + assertSame(original, mapped); + } /** * Two structs can be equal but in different locations. They can converge to this. 
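For reference, a hypothetical caller that wants reader field names matched to file field names case-insensitively (for example a reader schema declaring "A" against a file that wrote "a") can either set orc.schema.evolution.case.sensitive to false in the Configuration or clear the flag on the read options. A minimal sketch; the path and schema literals are made up for illustration:

    // Hypothetical usage of the option added by this patch; the path and
    // schemas are illustrative only.
    Configuration conf = new Configuration();
    Reader reader = OrcFile.createReader(new Path("/tmp/example.orc"),
        OrcFile.readerOptions(conf));
    Reader.Options options = new Reader.Options(conf)
        .schema(TypeDescription.fromString("struct<A:int,b:string>"))
        .isSchemaEvolutionCaseAware(false);  // match reader "A" to file "a"
    RecordReader rows = reader.rows(options);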
diff --git a/java/mapreduce/pom.xml b/java/mapreduce/pom.xml
index 93aea43eb1..9278b58c3d 100644
--- a/java/mapreduce/pom.xml
+++ b/java/mapreduce/pom.xml
@@ -19,7 +19,7 @@
   <parent>
     <groupId>org.apache.orc</groupId>
     <artifactId>orc</artifactId>
-    <version>1.3.3</version>
+    <version>1.3.3.fk.2</version>
     <relativePath>../pom.xml</relativePath>
   </parent>
diff --git a/java/pom.xml b/java/pom.xml
index 60bdf50348..40b08b89b5 100644
--- a/java/pom.xml
+++ b/java/pom.xml
@@ -21,7 +21,7 @@
   <groupId>org.apache.orc</groupId>
   <artifactId>orc</artifactId>
-  <version>1.3.3</version>
+  <version>1.3.3.fk.2</version>
   <packaging>pom</packaging>
   <name>Apache ORC</name>
@@ -253,17 +253,17 @@
       <dependency>
         <groupId>org.apache.orc</groupId>
         <artifactId>orc-core</artifactId>
-        <version>1.3.3</version>
+        <version>1.3.3.fk.2</version>
       </dependency>
       <dependency>
         <groupId>org.apache.orc</groupId>
         <artifactId>orc-mapreduce</artifactId>
-        <version>1.3.3</version>
+        <version>1.3.3.fk.2</version>
       </dependency>
       <dependency>
         <groupId>org.apache.orc</groupId>
         <artifactId>orc-tools</artifactId>
-        <version>1.3.3</version>
+        <version>1.3.3.fk.2</version>
       </dependency>
diff --git a/java/tools/pom.xml b/java/tools/pom.xml
index 154b1a6569..6d20ccd1c1 100644
--- a/java/tools/pom.xml
+++ b/java/tools/pom.xml
@@ -19,7 +19,7 @@
   <parent>
     <groupId>org.apache.orc</groupId>
    <artifactId>orc</artifactId>
-    <version>1.3.3</version>
+    <version>1.3.3.fk.2</version>
     <relativePath>../pom.xml</relativePath>
   </parent>

From 6ec520a555646bd21d1a1b6836970fdb0fed1565 Mon Sep 17 00:00:00 2001
From: piyush mukati
Date: Tue, 2 Jan 2018 15:28:45 +0530
Subject: [PATCH 19/21] FDPIN-2699 ORC-286 Backport (#2)

* backported ORC-286

* orc-286 version update
---
 java/core/pom.xml                             | 2 +-
 java/core/src/java/org/apache/orc/Reader.java | 2 +-
 java/mapreduce/pom.xml                        | 2 +-
 java/pom.xml                                  | 8 ++++----
 java/tools/pom.xml                            | 2 +-
 5 files changed, 8 insertions(+), 8 deletions(-)

diff --git a/java/core/pom.xml b/java/core/pom.xml
index ba05fe8a68..a5b8226c6e 100644
--- a/java/core/pom.xml
+++ b/java/core/pom.xml
@@ -19,7 +19,7 @@
   <parent>
     <groupId>org.apache.orc</groupId>
     <artifactId>orc</artifactId>
-    <version>1.3.3.fk.2</version>
+    <version>1.3.3.fk.3</version>
     <relativePath>../pom.xml</relativePath>
   </parent>
diff --git a/java/core/src/java/org/apache/orc/Reader.java b/java/core/src/java/org/apache/orc/Reader.java
index d4e6b56863..d0182a58ab 100644
--- a/java/core/src/java/org/apache/orc/Reader.java
+++ b/java/core/src/java/org/apache/orc/Reader.java
@@ -160,7 +160,7 @@ public static class Options implements Cloneable {
     private DataReader dataReader = null;
     private Boolean tolerateMissingSchema = null;
     private boolean forcePositionalEvolution;
-    private boolean isSchemaEvolutionCaseAware;
+    private boolean isSchemaEvolutionCaseAware = (boolean) OrcConf.IS_SCHEMA_EVOLUTION_CASE_SENSITIVE.getDefaultValue();
 
     public Options() {
       // PASS
diff --git a/java/mapreduce/pom.xml b/java/mapreduce/pom.xml
index 9278b58c3d..773943917d 100644
--- a/java/mapreduce/pom.xml
+++ b/java/mapreduce/pom.xml
@@ -19,7 +19,7 @@
   <parent>
     <groupId>org.apache.orc</groupId>
     <artifactId>orc</artifactId>
-    <version>1.3.3.fk.2</version>
+    <version>1.3.3.fk.3</version>
     <relativePath>../pom.xml</relativePath>
   </parent>
diff --git a/java/pom.xml b/java/pom.xml
index 40b08b89b5..32b72fdd88 100644
--- a/java/pom.xml
+++ b/java/pom.xml
@@ -21,7 +21,7 @@
   <groupId>org.apache.orc</groupId>
   <artifactId>orc</artifactId>
-  <version>1.3.3.fk.2</version>
+  <version>1.3.3.fk.3</version>
   <packaging>pom</packaging>
   <name>Apache ORC</name>
@@ -253,17 +253,17 @@
       <dependency>
         <groupId>org.apache.orc</groupId>
         <artifactId>orc-core</artifactId>
-        <version>1.3.3.fk.2</version>
+        <version>1.3.3.fk.3</version>
       </dependency>
      <dependency>
         <groupId>org.apache.orc</groupId>
         <artifactId>orc-mapreduce</artifactId>
-        <version>1.3.3.fk.2</version>
+        <version>1.3.3.fk.3</version>
       </dependency>
       <dependency>
         <groupId>org.apache.orc</groupId>
         <artifactId>orc-tools</artifactId>
-        <version>1.3.3.fk.2</version>
+        <version>1.3.3.fk.3</version>
       </dependency>
diff --git a/java/tools/pom.xml b/java/tools/pom.xml
index 6d20ccd1c1..657e6e3350 100644
--- a/java/tools/pom.xml
+++ b/java/tools/pom.xml
@@ -19,7 +19,7 @@
   <parent>
     <groupId>org.apache.orc</groupId>
     <artifactId>orc</artifactId>
-    <version>1.3.3.fk.2</version>
+    <version>1.3.3.fk.3</version>
     <relativePath>../pom.xml</relativePath>
   </parent>

From 61dd798d0e7efb4b6367d08f590ab7055c739e3a Mon Sep 17 00:00:00 2001
From: Ashish Kumar Sharma
Date: Thu, 4 Jan 2018 22:58:26 +0530
Subject: [PATCH 20/21] FDPIN-2690: ORC-285 - ORC double and float vector batch size fix (#3)

This is a back-ported patch.
Refer https://issues.apache.org/jira/browse/ORC-285
---
 java/core/pom.xml                          |   2 +-
 .../apache/orc/impl/TreeReaderFactory.java | 125 +++++++++---------
 .../org/apache/orc/TestVectorOrcFile.java  |  38 ++++++
 java/mapreduce/pom.xml                     |   2 +-
 java/pom.xml                               |   8 +-
 java/tools/pom.xml                         |   2 +-
 6 files changed, 109 insertions(+), 68 deletions(-)

diff --git a/java/core/pom.xml b/java/core/pom.xml
index a5b8226c6e..e992da2e56 100644
--- a/java/core/pom.xml
+++ b/java/core/pom.xml
@@ -19,7 +19,7 @@
   <parent>
     <groupId>org.apache.orc</groupId>
     <artifactId>orc</artifactId>
-    <version>1.3.3.fk.3</version>
+    <version>1.3.3.fk.4</version>
     <relativePath>../pom.xml</relativePath>
   </parent>
diff --git a/java/core/src/java/org/apache/orc/impl/TreeReaderFactory.java b/java/core/src/java/org/apache/orc/impl/TreeReaderFactory.java
index 4b369afe3a..9649be9dbe 100644
--- a/java/core/src/java/org/apache/orc/impl/TreeReaderFactory.java
+++ b/java/core/src/java/org/apache/orc/impl/TreeReaderFactory.java
@@ -640,40 +640,42 @@ public void nextVector(ColumnVector previousVector,
       final boolean hasNulls = !result.noNulls;
       boolean allNulls = hasNulls;
 
-      if (hasNulls) {
-        // conditions to ensure bounds checks skips
-        for (int i = 0; batchSize <= result.isNull.length && i < batchSize; i++) {
-          allNulls = allNulls & result.isNull[i];
-        }
-        if (allNulls) {
-          result.vector[0] = Double.NaN;
-          result.isRepeating = true;
-        } else {
-          // some nulls
-          result.isRepeating = false;
-          // conditions to ensure bounds checks skips
-          for (int i = 0; batchSize <= result.isNull.length
-              && batchSize <= result.vector.length && i < batchSize; i++) {
-            if (!result.isNull[i]) {
-              result.vector[i] = utils.readFloat(stream);
-            } else {
-              // If the value is not present then set NaN
-              result.vector[i] = Double.NaN;
-            }
+      if (batchSize > 0) {
+        if (hasNulls) {
+          // conditions to ensure bounds checks skips
+          for (int i = 0; batchSize <= result.isNull.length && i < batchSize; i++) {
+            allNulls = allNulls & result.isNull[i];
           }
+          if (allNulls) {
+            result.vector[0] = Double.NaN;
+            result.isRepeating = true;
+          } else {
+            // some nulls
+            result.isRepeating = false;
+            // conditions to ensure bounds checks skips
+            for (int i = 0; batchSize <= result.isNull.length
+                && batchSize <= result.vector.length && i < batchSize; i++) {
+              if (!result.isNull[i]) {
+                result.vector[i] = utils.readFloat(stream);
+              } else {
+                // If the value is not present then set NaN
+                result.vector[i] = Double.NaN;
+              }
+            }
+          }
+        } else {
+          // no nulls & > 1 row (check repeating)
+          boolean repeating = (batchSize > 1);
+          final float f1 = utils.readFloat(stream);
+          result.vector[0] = f1;
+          // conditions to ensure bounds checks skips
+          for (int i = 1; i < batchSize && batchSize <= result.vector.length; i++) {
+            final float f2 = utils.readFloat(stream);
+            repeating = repeating && (f1 == f2);
+            result.vector[i] = f2;
+          }
+          result.isRepeating = repeating;
         }
-      } else {
-        // no nulls & > 1 row (check repeating)
-        boolean repeating = (batchSize > 1);
-        final float f1 = utils.readFloat(stream);
-        result.vector[0] = f1;
-        // conditions to ensure bounds checks skips
-        for (int i = 1; i < batchSize && batchSize <= result.vector.length; i++) {
-          final float f2 = utils.readFloat(stream);
-          repeating = repeating && (f1 == f2);
-          result.vector[i] = f2;
-        }
-        result.isRepeating = repeating;
       }
     }
@@ -733,41 +735,42 @@ public void nextVector(ColumnVector previousVector,
       final boolean hasNulls = !result.noNulls;
       boolean allNulls = hasNulls;
-
-      if (hasNulls) {
-        // conditions to ensure bounds checks skips
-        for (int i = 0; i < batchSize && batchSize <= result.isNull.length; i++) {
-          allNulls = allNulls & result.isNull[i];
-        }
-        if (allNulls) {
-          result.vector[0] = Double.NaN;
-          result.isRepeating = true;
-        } else {
-          // some nulls
-          result.isRepeating = false;
-          // conditions to ensure bounds checks skips
-          for (int i = 0; batchSize <= result.isNull.length
-              && batchSize <= result.vector.length && i < batchSize; i++) {
-            if (!result.isNull[i]) {
-              result.vector[i] = utils.readDouble(stream);
-            } else {
-              // If the value is not present then set NaN
-              result.vector[i] = Double.NaN;
-            }
+      if (batchSize != 0) {
+        if (hasNulls) {
+          // conditions to ensure bounds checks skips
+          for (int i = 0; i < batchSize && batchSize <= result.isNull.length; i++) {
+            allNulls = allNulls & result.isNull[i];
           }
+          if (allNulls) {
+            result.vector[0] = Double.NaN;
+            result.isRepeating = true;
+          } else {
+            // some nulls
+            result.isRepeating = false;
+            // conditions to ensure bounds checks skips
+            for (int i = 0; batchSize <= result.isNull.length
+                && batchSize <= result.vector.length && i < batchSize; i++) {
+              if (!result.isNull[i]) {
+                result.vector[i] = utils.readDouble(stream);
+              } else {
+                // If the value is not present then set NaN
+                result.vector[i] = Double.NaN;
+              }
+            }
+          }
+        } else {
+          // no nulls
+          boolean repeating = (batchSize > 1);
+          final double d1 = utils.readDouble(stream);
+          result.vector[0] = d1;
+          // conditions to ensure bounds checks skips
+          for (int i = 1; i < batchSize && batchSize <= result.vector.length; i++) {
+            final double d2 = utils.readDouble(stream);
+            repeating = repeating && (d1 == d2);
+            result.vector[i] = d2;
+          }
+          result.isRepeating = repeating;
         }
-      } else {
-        // no nulls
-        boolean repeating = (batchSize > 1);
-        final double d1 = utils.readDouble(stream);
-        result.vector[0] = d1;
-        // conditions to ensure bounds checks skips
-        for (int i = 1; i < batchSize && batchSize <= result.vector.length; i++) {
-          final double d2 = utils.readDouble(stream);
-          repeating = repeating && (d1 == d2);
-          result.vector[i] = d2;
-        }
-        result.isRepeating = repeating;
       }
     }
diff --git a/java/core/src/test/org/apache/orc/TestVectorOrcFile.java b/java/core/src/test/org/apache/orc/TestVectorOrcFile.java
index b7fa8eee88..6cd3dedbeb 100644
--- a/java/core/src/test/org/apache/orc/TestVectorOrcFile.java
+++ b/java/core/src/test/org/apache/orc/TestVectorOrcFile.java
@@ -3129,4 +3129,42 @@ public void testMerge() throws Exception {
     assertEquals(fromString("baz"), reader.getMetadataValue("c"));
     assertEquals(fromString("bat"), reader.getMetadataValue("d"));
   }
+
+  Path exampleDir = new Path(System.getProperty("example.dir",
+      "../../examples/"));
+
+  @Test
+  public void testEmptyDoubleStream() throws Exception {
+    TypeDescription schema =
+        TypeDescription.fromString("struct<list1:array<double>," +
+            "list2:array<float>>");
+    Writer writer = OrcFile.createWriter(testFilePath,
+        OrcFile.writerOptions(conf).setSchema(schema));
+    VectorizedRowBatch batch = schema.createRowBatch();
+    batch.size = 2;
+    ListColumnVector list1 = (ListColumnVector) batch.cols[0];
+    ListColumnVector list2 = (ListColumnVector) batch.cols[1];
+    for(int r=0; r < batch.size; ++r) {
+      list1.offsets[r] = 0;
+      list1.lengths[r] = 0;
+      list2.offsets[r] = 0;
+      list2.lengths[r] = 0;
+    }
+    writer.addRowBatch(batch);
+    writer.close();
+    Reader reader = OrcFile.createReader(testFilePath,
+        OrcFile.readerOptions(conf));
+    RecordReader rows = reader.rows();
+    batch = reader.getSchema().createRowBatch();
+    assertTrue(rows.nextBatch(batch));
+    assertEquals(2, batch.size);
+    list1 = (ListColumnVector) batch.cols[0];
+    list2 = (ListColumnVector) batch.cols[1];
+    for(int r=0; r < batch.size; ++r) {
+      assertEquals(0, list1.lengths[r]);
+      assertEquals(0, list2.lengths[r]);
+    }
+    assertFalse(rows.nextBatch(batch));
+    rows.close();
+  }
 }
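For orientation: the failure ORC-285 guards against arises when every row in a batch carries an empty list, so the child double or float column is asked for batchSize == 0 values; before the guard, the no-nulls branch still read one value from the (possibly empty) data stream. A sketch of consuming such a file through the vectorized API (the path "/tmp/lists.orc" and a schema of struct<list1:array<double>> are hypothetical, not part of the patch):

    import org.apache.hadoop.conf.Configuration;
    import org.apache.hadoop.fs.Path;
    import org.apache.hadoop.hive.ql.exec.vector.DoubleColumnVector;
    import org.apache.hadoop.hive.ql.exec.vector.ListColumnVector;
    import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch;
    import org.apache.orc.OrcFile;
    import org.apache.orc.Reader;
    import org.apache.orc.RecordReader;

    public class ReadDoubleLists {
      public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        Reader reader = OrcFile.createReader(new Path("/tmp/lists.orc"),
            OrcFile.readerOptions(conf));
        RecordReader rows = reader.rows();
        VectorizedRowBatch batch = reader.getSchema().createRowBatch();
        while (rows.nextBatch(batch)) {
          ListColumnVector list1 = (ListColumnVector) batch.cols[0];
          DoubleColumnVector child = (DoubleColumnVector) list1.child;
          for (int r = 0; r < batch.size; ++r) {
            // isRepeating means row 0 stands in for every row in the batch
            int row = list1.isRepeating ? 0 : r;
            long start = list1.offsets[row];
            long len = list1.lengths[row]; // 0 for an empty list
            for (long i = start; i < start + len; ++i) {
              System.out.println(child.vector[(int) i]);
            }
          }
        }
        rows.close();
      }
    }

When all lists are empty, the inner loop never touches child.vector, which is exactly why the reader must not read from the value stream either.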
diff --git a/java/mapreduce/pom.xml b/java/mapreduce/pom.xml
index 773943917d..44f58629d8 100644
--- a/java/mapreduce/pom.xml
+++ b/java/mapreduce/pom.xml
@@ -19,7 +19,7 @@
   <parent>
     <groupId>org.apache.orc</groupId>
     <artifactId>orc</artifactId>
-    <version>1.3.3.fk.3</version>
+    <version>1.3.3.fk.4</version>
     <relativePath>../pom.xml</relativePath>
   </parent>
diff --git a/java/pom.xml b/java/pom.xml
index 32b72fdd88..a7bc85412f 100644
--- a/java/pom.xml
+++ b/java/pom.xml
@@ -21,7 +21,7 @@
   <groupId>org.apache.orc</groupId>
   <artifactId>orc</artifactId>
-  <version>1.3.3.fk.3</version>
+  <version>1.3.3.fk.4</version>
   <packaging>pom</packaging>
   <name>Apache ORC</name>
@@ -253,17 +253,17 @@
       <dependency>
         <groupId>org.apache.orc</groupId>
         <artifactId>orc-core</artifactId>
-        <version>1.3.3.fk.3</version>
+        <version>1.3.3.fk.4</version>
       </dependency>
       <dependency>
         <groupId>org.apache.orc</groupId>
         <artifactId>orc-mapreduce</artifactId>
-        <version>1.3.3.fk.3</version>
+        <version>1.3.3.fk.4</version>
       </dependency>
       <dependency>
         <groupId>org.apache.orc</groupId>
         <artifactId>orc-tools</artifactId>
-        <version>1.3.3.fk.3</version>
+        <version>1.3.3.fk.4</version>
       </dependency>
diff --git a/java/tools/pom.xml b/java/tools/pom.xml
index 657e6e3350..e6d453cf5b 100644
--- a/java/tools/pom.xml
+++ b/java/tools/pom.xml
@@ -19,7 +19,7 @@
   <parent>
     <groupId>org.apache.orc</groupId>
     <artifactId>orc</artifactId>
-    <version>1.3.3.fk.3</version>
+    <version>1.3.3.fk.4</version>
     <relativePath>../pom.xml</relativePath>
   </parent>

From f1784891f6bd058c2495c021ce1c89ca57eba112 Mon Sep 17 00:00:00 2001
From: amrk7s
Date: Mon, 15 Jan 2018 14:47:06 +0530
Subject: [PATCH 21/21] HIVE-18325: Temporary fix; to be removed after HIVE-18325 is resolved

---
 java/core/src/java/org/apache/orc/Reader.java | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/java/core/src/java/org/apache/orc/Reader.java b/java/core/src/java/org/apache/orc/Reader.java
index d0182a58ab..6c8100e331 100644
--- a/java/core/src/java/org/apache/orc/Reader.java
+++ b/java/core/src/java/org/apache/orc/Reader.java
@@ -163,7 +163,7 @@ public static class Options implements Cloneable {
     private boolean isSchemaEvolutionCaseAware = (boolean) OrcConf.IS_SCHEMA_EVOLUTION_CASE_SENSITIVE.getDefaultValue();
 
     public Options() {
-      // PASS
+      isSchemaEvolutionCaseAware = false;
     }
 
     public Options(Configuration conf) {
@@ -171,8 +171,7 @@ public Options(Configuration conf) {
       skipCorruptRecords = OrcConf.SKIP_CORRUPT_DATA.getBoolean(conf);
       tolerateMissingSchema = OrcConf.TOLERATE_MISSING_SCHEMA.getBoolean(conf);
       forcePositionalEvolution = OrcConf.FORCE_POSITIONAL_EVOLUTION.getBoolean(conf);
-      isSchemaEvolutionCaseAware =
-          OrcConf.IS_SCHEMA_EVOLUTION_CASE_SENSITIVE.getBoolean(conf);
+      isSchemaEvolutionCaseAware = false;
     }
 
     /**
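One consequence of this workaround worth spelling out (an illustration, assuming the patched Reader.Options above): both constructors now hardcode isSchemaEvolutionCaseAware to false, so the orc.schema.evolution.case.sensitive key is ignored and field matching is case-insensitive unless a caller opts back in through the setter:

    import org.apache.hadoop.conf.Configuration;
    import org.apache.orc.Reader;

    public class WorkaroundEffect {
      public static void main(String[] args) {
        Configuration conf = new Configuration();
        // With this patch applied, the conf value below has no effect:
        conf.setBoolean("orc.schema.evolution.case.sensitive", true);

        Reader.Options options = new Reader.Options(conf);
        System.out.println(options.getIsSchemaEvolutionCaseAware()); // false

        // Callers that still need case-sensitive matching must opt back in:
        options.isSchemaEvolutionCaseAware(true);
        System.out.println(options.getIsSchemaEvolutionCaseAware()); // true
      }
    }

This is why the change is flagged as temporary: it silently inverts the documented default from ORC-286 until HIVE-18325 is resolved upstream.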