From 69c33fd7eb94d4786f41df6bdb17c1ba4c259065 Mon Sep 17 00:00:00 2001 From: Owen O'Malley Date: Fri, 6 Jan 2017 10:22:11 -0800 Subject: [PATCH] ORC-128. Add getStatistics to Writer API to allow user to get statistics as the file is written. Signed-off-by: Owen O'Malley --- java/core/src/java/org/apache/orc/Writer.java | 12 + .../apache/orc/impl/ColumnStatisticsImpl.java | 316 ++++++++++++++++++ .../java/org/apache/orc/impl/ReaderImpl.java | 6 +- .../java/org/apache/orc/impl/WriterImpl.java | 12 + .../apache/orc/TestOrcNullOptimization.java | 18 +- .../org/apache/orc/TestVectorOrcFile.java | 22 +- 6 files changed, 379 insertions(+), 7 deletions(-) diff --git a/java/core/src/java/org/apache/orc/Writer.java b/java/core/src/java/org/apache/orc/Writer.java index 4492062f31..596e14ed4f 100644 --- a/java/core/src/java/org/apache/orc/Writer.java +++ b/java/core/src/java/org/apache/orc/Writer.java @@ -111,4 +111,16 @@ public void appendStripe(byte[] stripe, int offset, int length, * @param userMetadata - user metadata */ public void appendUserMetadata(List userMetadata); + + /** + * Get the statistics about the columns in the file. The output of this is + * based on the time at which it is called. It shall use all of the currently + * written data to provide the statistics. + * + * Please note there are costs involved with invoking this method and should + * be used judiciously. + * + * @return the information about the column + */ + ColumnStatistics[] getStatistics() throws IOException; } diff --git a/java/core/src/java/org/apache/orc/impl/ColumnStatisticsImpl.java b/java/core/src/java/org/apache/orc/impl/ColumnStatisticsImpl.java index 745ed9af34..7e1826a6e4 100644 --- a/java/core/src/java/org/apache/orc/impl/ColumnStatisticsImpl.java +++ b/java/core/src/java/org/apache/orc/impl/ColumnStatisticsImpl.java @@ -39,6 +39,34 @@ public class ColumnStatisticsImpl implements ColumnStatistics { + @Override + public boolean equals(Object o) { + if (this == o) { + return true; + } + if (!(o instanceof ColumnStatisticsImpl)) { + return false; + } + + ColumnStatisticsImpl that = (ColumnStatisticsImpl) o; + + if (count != that.count) { + return false; + } + if (hasNull != that.hasNull) { + return false; + } + + return true; + } + + @Override + public int hashCode() { + int result = (int) (count ^ (count >>> 32)); + result = 31 * result + (hasNull ? 1 : 0); + return result; + } + private static final class BooleanStatisticsImpl extends ColumnStatisticsImpl implements BooleanColumnStatistics { private long trueCount = 0; @@ -102,6 +130,34 @@ public long getTrueCount() { public String toString() { return super.toString() + " true: " + trueCount; } + + @Override + public boolean equals(Object o) { + if (this == o) { + return true; + } + if (!(o instanceof BooleanStatisticsImpl)) { + return false; + } + if (!super.equals(o)) { + return false; + } + + BooleanStatisticsImpl that = (BooleanStatisticsImpl) o; + + if (trueCount != that.trueCount) { + return false; + } + + return true; + } + + @Override + public int hashCode() { + int result = super.hashCode(); + result = 31 * result + (int) (trueCount ^ (trueCount >>> 32)); + return result; + } } private static final class IntegerStatisticsImpl extends ColumnStatisticsImpl @@ -247,6 +303,50 @@ public String toString() { } return buf.toString(); } + + @Override + public boolean equals(Object o) { + if (this == o) { + return true; + } + if (!(o instanceof IntegerStatisticsImpl)) { + return false; + } + if (!super.equals(o)) { + return false; + } + + IntegerStatisticsImpl that = (IntegerStatisticsImpl) o; + + if (minimum != that.minimum) { + return false; + } + if (maximum != that.maximum) { + return false; + } + if (sum != that.sum) { + return false; + } + if (hasMinimum != that.hasMinimum) { + return false; + } + if (overflow != that.overflow) { + return false; + } + + return true; + } + + @Override + public int hashCode() { + int result = super.hashCode(); + result = 31 * result + (int) (minimum ^ (minimum >>> 32)); + result = 31 * result + (int) (maximum ^ (maximum >>> 32)); + result = 31 * result + (int) (sum ^ (sum >>> 32)); + result = 31 * result + (hasMinimum ? 1 : 0); + result = 31 * result + (overflow ? 1 : 0); + return result; + } } private static final class DoubleStatisticsImpl extends ColumnStatisticsImpl @@ -364,6 +464,50 @@ public String toString() { buf.append(sum); return buf.toString(); } + + @Override + public boolean equals(Object o) { + if (this == o) { + return true; + } + if (!(o instanceof DoubleStatisticsImpl)) { + return false; + } + if (!super.equals(o)) { + return false; + } + + DoubleStatisticsImpl that = (DoubleStatisticsImpl) o; + + if (hasMinimum != that.hasMinimum) { + return false; + } + if (Double.compare(that.minimum, minimum) != 0) { + return false; + } + if (Double.compare(that.maximum, maximum) != 0) { + return false; + } + if (Double.compare(that.sum, sum) != 0) { + return false; + } + + return true; + } + + @Override + public int hashCode() { + int result = super.hashCode(); + long temp; + result = 31 * result + (hasMinimum ? 1 : 0); + temp = Double.doubleToLongBits(minimum); + result = 31 * result + (int) (temp ^ (temp >>> 32)); + temp = Double.doubleToLongBits(maximum); + result = 31 * result + (int) (temp ^ (temp >>> 32)); + temp = Double.doubleToLongBits(sum); + result = 31 * result + (int) (temp ^ (temp >>> 32)); + return result; + } } protected static final class StringStatisticsImpl extends ColumnStatisticsImpl @@ -498,6 +642,42 @@ public String toString() { } return buf.toString(); } + + @Override + public boolean equals(Object o) { + if (this == o) { + return true; + } + if (!(o instanceof StringStatisticsImpl)) { + return false; + } + if (!super.equals(o)) { + return false; + } + + StringStatisticsImpl that = (StringStatisticsImpl) o; + + if (sum != that.sum) { + return false; + } + if (minimum != null ? !minimum.equals(that.minimum) : that.minimum != null) { + return false; + } + if (maximum != null ? !maximum.equals(that.maximum) : that.maximum != null) { + return false; + } + + return true; + } + + @Override + public int hashCode() { + int result = super.hashCode(); + result = 31 * result + (minimum != null ? minimum.hashCode() : 0); + result = 31 * result + (maximum != null ? maximum.hashCode() : 0); + result = 31 * result + (int) (sum ^ (sum >>> 32)); + return result; + } } protected static final class BinaryStatisticsImpl extends ColumnStatisticsImpl implements @@ -569,6 +749,34 @@ public String toString() { } return buf.toString(); } + + @Override + public boolean equals(Object o) { + if (this == o) { + return true; + } + if (!(o instanceof BinaryStatisticsImpl)) { + return false; + } + if (!super.equals(o)) { + return false; + } + + BinaryStatisticsImpl that = (BinaryStatisticsImpl) o; + + if (sum != that.sum) { + return false; + } + + return true; + } + + @Override + public int hashCode() { + int result = super.hashCode(); + result = 31 * result + (int) (sum ^ (sum >>> 32)); + return result; + } } private static final class DecimalStatisticsImpl extends ColumnStatisticsImpl @@ -694,6 +902,42 @@ public String toString() { } return buf.toString(); } + + @Override + public boolean equals(Object o) { + if (this == o) { + return true; + } + if (!(o instanceof DecimalStatisticsImpl)) { + return false; + } + if (!super.equals(o)) { + return false; + } + + DecimalStatisticsImpl that = (DecimalStatisticsImpl) o; + + if (minimum != null ? !minimum.equals(that.minimum) : that.minimum != null) { + return false; + } + if (maximum != null ? !maximum.equals(that.maximum) : that.maximum != null) { + return false; + } + if (sum != null ? !sum.equals(that.sum) : that.sum != null) { + return false; + } + + return true; + } + + @Override + public int hashCode() { + int result = super.hashCode(); + result = 31 * result + (minimum != null ? minimum.hashCode() : 0); + result = 31 * result + (maximum != null ? maximum.hashCode() : 0); + result = 31 * result + (sum != null ? sum.hashCode() : 0); + return result; + } } private static final class DateStatisticsImpl extends ColumnStatisticsImpl @@ -815,6 +1059,46 @@ public String toString() { } return buf.toString(); } + + @Override + public boolean equals(Object o) { + if (this == o) { + return true; + } + if (!(o instanceof DateStatisticsImpl)) { + return false; + } + if (!super.equals(o)) { + return false; + } + + DateStatisticsImpl that = (DateStatisticsImpl) o; + + if (minimum != null ? !minimum.equals(that.minimum) : that.minimum != null) { + return false; + } + if (maximum != null ? !maximum.equals(that.maximum) : that.maximum != null) { + return false; + } + if (minDate != null ? !minDate.equals(that.minDate) : that.minDate != null) { + return false; + } + if (maxDate != null ? !maxDate.equals(that.maxDate) : that.maxDate != null) { + return false; + } + + return true; + } + + @Override + public int hashCode() { + int result = super.hashCode(); + result = 31 * result + (minimum != null ? minimum.hashCode() : 0); + result = 31 * result + (maximum != null ? maximum.hashCode() : 0); + result = 31 * result + (minDate != null ? minDate.hashCode() : 0); + result = 31 * result + (maxDate != null ? maxDate.hashCode() : 0); + return result; + } } private static final class TimestampStatisticsImpl extends ColumnStatisticsImpl @@ -925,6 +1209,38 @@ public String toString() { } return buf.toString(); } + + @Override + public boolean equals(Object o) { + if (this == o) { + return true; + } + if (!(o instanceof TimestampStatisticsImpl)) { + return false; + } + if (!super.equals(o)) { + return false; + } + + TimestampStatisticsImpl that = (TimestampStatisticsImpl) o; + + if (minimum != null ? !minimum.equals(that.minimum) : that.minimum != null) { + return false; + } + if (maximum != null ? !maximum.equals(that.maximum) : that.maximum != null) { + return false; + } + + return true; + } + + @Override + public int hashCode() { + int result = super.hashCode(); + result = 31 * result + (minimum != null ? minimum.hashCode() : 0); + result = 31 * result + (maximum != null ? maximum.hashCode() : 0); + return result; + } } private long count = 0; diff --git a/java/core/src/java/org/apache/orc/impl/ReaderImpl.java b/java/core/src/java/org/apache/orc/impl/ReaderImpl.java index ad3f8bad1d..c24920d62b 100644 --- a/java/core/src/java/org/apache/orc/impl/ReaderImpl.java +++ b/java/core/src/java/org/apache/orc/impl/ReaderImpl.java @@ -216,7 +216,11 @@ public int getRowIndexStride() { @Override public ColumnStatistics[] getStatistics() { - ColumnStatistics[] result = new ColumnStatistics[types.size()]; + return deserializeStats(fileStats); + } + + static ColumnStatistics[] deserializeStats(List fileStats){ + ColumnStatistics[] result = new ColumnStatistics[fileStats.size()]; for(int i=0; i < result.length; ++i) { result[i] = ColumnStatisticsImpl.deserialize(fileStats.get(i)); } diff --git a/java/core/src/java/org/apache/orc/impl/WriterImpl.java b/java/core/src/java/org/apache/orc/impl/WriterImpl.java index 940ef593a4..d3ab8d03a4 100644 --- a/java/core/src/java/org/apache/orc/impl/WriterImpl.java +++ b/java/core/src/java/org/apache/orc/impl/WriterImpl.java @@ -37,6 +37,7 @@ import io.airlift.compress.lzo.LzoDecompressor; import org.apache.hadoop.hive.ql.util.JavaDataModel; import org.apache.orc.BinaryColumnStatistics; +import org.apache.orc.ColumnStatistics; import org.apache.orc.util.BloomFilter; import org.apache.orc.util.BloomFilterIO; import org.apache.orc.CompressionCodec; @@ -3059,4 +3060,15 @@ public void appendUserMetadata(List userMetadata) { } } } + + @Override + public ColumnStatistics[] getStatistics() + throws IOException { + // Generate the stats + OrcProto.Footer.Builder builder = OrcProto.Footer.newBuilder(); + + // add the column statistics + writeFileStatistics(builder, treeWriter); + return ReaderImpl.deserializeStats(builder.getStatisticsList()); + } } diff --git a/java/core/src/test/org/apache/orc/TestOrcNullOptimization.java b/java/core/src/test/org/apache/orc/TestOrcNullOptimization.java index 0b605c9fdc..45b69b2a5c 100644 --- a/java/core/src/test/org/apache/orc/TestOrcNullOptimization.java +++ b/java/core/src/test/org/apache/orc/TestOrcNullOptimization.java @@ -18,13 +18,15 @@ package org.apache.orc; import static junit.framework.Assert.assertEquals; +import static org.apache.orc.TestVectorOrcFile.assertEmptyStats; +import static org.junit.Assert.assertArrayEquals; import java.io.File; import java.io.IOException; import java.util.List; import java.util.Random; -import junit.framework.Assert; +import org.junit.Assert; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.Path; @@ -233,9 +235,19 @@ public void testMultiStripeWithoutNull() throws Exception { .compress(CompressionKind.NONE) .bufferSize(10000)); Random rand = new Random(100); - VectorizedRowBatch batch = schema.createRowBatch(); + int batchSize = 5000; + VectorizedRowBatch batch = schema.createRowBatch(batchSize); + ColumnStatistics[] writerStats = writer.getStatistics(); + assertEmptyStats(writerStats); + int count = 0; for (int i = 1; i < 20000; i++) { addRow(writer, batch, rand.nextInt(1), "a", true, 100); + count++; + if (count % batchSize == 1) { + writerStats = writer.getStatistics(); + } else { + assertArrayEquals(writerStats, writer.getStatistics()); + } } addRow(writer, batch, 0, "b", true, 100); writer.addRowBatch(batch); @@ -245,6 +257,7 @@ public void testMultiStripeWithoutNull() throws Exception { OrcFile.readerOptions(conf).filesystem(fs)); // check the stats ColumnStatistics[] stats = reader.getStatistics(); + assertArrayEquals(stats, writer.getStatistics()); assertEquals(20000, reader.getNumberOfRows()); assertEquals(20000, stats[0].getNumberOfValues()); @@ -338,6 +351,7 @@ public void testColumnsWithNullAndCompression() throws Exception { OrcFile.readerOptions(conf).filesystem(fs)); // check the stats ColumnStatistics[] stats = reader.getStatistics(); + assertArrayEquals(stats, writer.getStatistics()); assertEquals(8, reader.getNumberOfRows()); assertEquals(8, stats[0].getNumberOfValues()); diff --git a/java/core/src/test/org/apache/orc/TestVectorOrcFile.java b/java/core/src/test/org/apache/orc/TestVectorOrcFile.java index af31b4d0bd..2448cb7d73 100644 --- a/java/core/src/test/org/apache/orc/TestVectorOrcFile.java +++ b/java/core/src/test/org/apache/orc/TestVectorOrcFile.java @@ -20,7 +20,7 @@ import com.google.common.collect.Lists; -import junit.framework.Assert; +import org.junit.Assert; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.Path; @@ -69,6 +69,7 @@ import static junit.framework.TestCase.assertNotNull; import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertArrayEquals; import static org.junit.Assert.assertFalse; import static org.junit.Assert.assertNull; import static org.junit.Assert.assertTrue; @@ -438,6 +439,7 @@ public void testStringAndBinaryStatistics() throws Exception { // check the stats ColumnStatistics[] stats = reader.getStatistics(); + assertArrayEquals(stats, writer.getStatistics()); assertEquals(4, stats[0].getNumberOfValues()); assertEquals("count: 4 hasNull: false", stats[0].toString()); @@ -914,7 +916,7 @@ private static TypeDescription createBigRowSchema() { createInnerSchema())); } - static void assertArrayEquals(boolean[] expected, boolean[] actual) { + static void assertArrayBooleanEquals(boolean[] expected, boolean[] actual) { assertEquals(expected.length, actual.length); boolean diff = false; for(int i=0; i < expected.length; ++i) { @@ -935,6 +937,7 @@ public void test1() throws Exception { .setSchema(schema) .stripeSize(100000) .bufferSize(10000)); + assertEmptyStats(writer.getStatistics()); VectorizedRowBatch batch = schema.createRowBatch(); batch.size = 2; setBigRow(batch, 0, false, (byte) 1, (short) 1024, 65536, @@ -948,7 +951,9 @@ public void test1() throws Exception { list(inner(100000000, "cat"), inner(-100000, "in"), inner(1234, "hat")), map(inner(5, "chani"), inner(1, "mauddib"))); writer.addRowBatch(batch); + assertEmptyStats(writer.getStatistics()); writer.close(); + ColumnStatistics[] closeStatistics = writer.getStatistics(); Reader reader = OrcFile.createReader(testFilePath, OrcFile.readerOptions(conf).filesystem(fs)); @@ -969,7 +974,7 @@ public void test1() throws Exception { true, true, true, true}; included = OrcUtils.includeColumns("boolean1,string1,middle,map", schema); - assertArrayEquals(expected, included); + assertArrayBooleanEquals(expected, included); expected = new boolean[] {false, true, false, false, false, false, false, false, false, true, @@ -977,7 +982,7 @@ public void test1() throws Exception { false, false, false, false, true, true, true, true, true}; included = OrcUtils.includeColumns("boolean1,string1,middle,map", schema); - assertArrayEquals(expected, included); + assertArrayBooleanEquals(expected, included); expected = new boolean[] {false, true, true, true, true, true, true, true, true, true, @@ -991,6 +996,7 @@ public void test1() throws Exception { // check the stats ColumnStatistics[] stats = reader.getStatistics(); + assertArrayEquals(stats, closeStatistics); assertEquals(2, stats[1].getNumberOfValues()); assertEquals(1, ((BooleanColumnStatistics) stats[1]).getFalseCount()); assertEquals(1, ((BooleanColumnStatistics) stats[1]).getTrueCount()); @@ -1128,6 +1134,13 @@ public void test1() throws Exception { rows.close(); } + static void assertEmptyStats(ColumnStatistics[] writerStatistics) { + for (ColumnStatistics columnStatistics : writerStatistics){ + assertEquals(0, columnStatistics.getNumberOfValues()); + assertFalse(columnStatistics.hasNull()); + } + } + @Test public void testColumnProjection() throws Exception { TypeDescription schema = createInnerSchema(); @@ -2366,6 +2379,7 @@ public void testRepeating() throws Exception { // check the stats ColumnStatistics[] stats = reader.getStatistics(); + assertArrayEquals(stats, writer.getStatistics()); assertEquals(4096, stats[0].getNumberOfValues()); assertEquals(false, stats[0].hasNull()); for(TypeDescription colType: schema.getChildren()) {