From 6f924203c27f36a75dc9dcbb3982dfe8eb858657 Mon Sep 17 00:00:00 2001 From: Gang Wu Date: Wed, 26 Apr 2017 22:39:25 -0700 Subject: [PATCH 1/4] Refactor common classes for writer and reader This is mainly a refactoring change for ORC-176, including: 1. Extracted common classes and functions into Common.hh and Common.cc; 2. Put InputStream interface and its implementations from Compression.hh to Stream.hh; 3. Refactored ColumnStatistic classes to reuse code as much as possible and added setters for writer; 4. Added more functions in Int128 class for decimal operations; 5. Added buildTypeFromString() method in Type class to construct ORC type. --- c++/include/orc/Common.hh | 167 +++++++ c++/include/orc/Int128.hh | 109 ++++- c++/include/orc/Reader.hh | 137 +----- c++/include/orc/Statistics.hh | 284 ++++++++++- c++/include/orc/Type.hh | 5 + c++/src/CMakeLists.txt | 2 + c++/src/Common.cc | 107 ++++ c++/src/Compression.cc | 195 -------- c++/src/Compression.hh | 92 +--- c++/src/Int128.cc | 72 +++ c++/src/Reader.cc | 82 ---- c++/src/Statistics.cc | 153 ++++-- c++/src/Statistics.hh | 892 ++++++++++++++++++++++++++++------ c++/src/Stream.cc | 222 +++++++++ c++/src/Stream.hh | 116 +++++ c++/src/TypeImpl.cc | 186 +++++++ c++/src/TypeImpl.hh | 5 + 17 files changed, 2130 insertions(+), 696 deletions(-) create mode 100644 c++/include/orc/Common.hh create mode 100644 c++/src/Common.cc create mode 100644 c++/src/Stream.cc create mode 100644 c++/src/Stream.hh diff --git a/c++/include/orc/Common.hh b/c++/include/orc/Common.hh new file mode 100644 index 0000000000..f499b81f78 --- /dev/null +++ b/c++/include/orc/Common.hh @@ -0,0 +1,167 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. 
The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef ORC_COMMON_HH +#define ORC_COMMON_HH + +#include "orc/Vector.hh" +#include "orc/Type.hh" +#include "Exceptions.hh" +#include "wrap/orc-proto-wrapper.hh" + +#include + +namespace orc { + enum CompressionKind { + CompressionKind_NONE = 0, + CompressionKind_ZLIB = 1, + CompressionKind_SNAPPY = 2, + CompressionKind_LZO = 3, + CompressionKind_LZ4 = 4, + CompressionKind_ZSTD = 5, + CompressionKind_MAX = INT64_MAX + }; + + /** + * Get the name of the CompressionKind. + */ + std::string compressionKindToString(CompressionKind kind); + + enum WriterVersion { + WriterVersion_ORIGINAL = 0, + WriterVersion_HIVE_8732 = 1, + WriterVersion_HIVE_4243 = 2, + WriterVersion_HIVE_12055 = 3, + WriterVersion_HIVE_13083 = 4, + WriterVersion_ORC_101 = 5, + WriterVersion_ORC_135 = 6, + WriterVersion_MAX = INT64_MAX + }; + + /** + * Get the name of the WriterVersion. + */ + std::string writerVersionToString(WriterVersion kind); + + enum StreamKind { + StreamKind_PRESENT = 0, + StreamKind_DATA = 1, + StreamKind_LENGTH = 2, + StreamKind_DICTIONARY_DATA = 3, + StreamKind_DICTIONARY_COUNT = 4, + StreamKind_SECONDARY = 5, + StreamKind_ROW_INDEX = 6, + StreamKind_BLOOM_FILTER = 7 + }; + + /** + * Get the string representation of the StreamKind. 
+ */ + std::string streamKindToString(StreamKind kind); + + class StreamInformation { + public: + virtual ~StreamInformation(); + + virtual StreamKind getKind() const = 0; + virtual uint64_t getColumnId() const = 0; + virtual uint64_t getOffset() const = 0; + virtual uint64_t getLength() const = 0; + }; + + enum ColumnEncodingKind { + ColumnEncodingKind_DIRECT = 0, + ColumnEncodingKind_DICTIONARY = 1, + ColumnEncodingKind_DIRECT_V2 = 2, + ColumnEncodingKind_DICTIONARY_V2 = 3 + }; + + std::string columnEncodingKindToString(ColumnEncodingKind kind); + + class StripeInformation { + public: + virtual ~StripeInformation(); + + /** + * Get the byte offset of the start of the stripe. + * @return the bytes from the start of the file + */ + virtual uint64_t getOffset() const = 0; + + /** + * Get the total length of the stripe in bytes. + * @return the number of bytes in the stripe + */ + virtual uint64_t getLength() const = 0; + + /** + * Get the length of the stripe's indexes. + * @return the number of bytes in the index + */ + virtual uint64_t getIndexLength() const = 0; + + /** + * Get the length of the stripe's data. + * @return the number of bytes in the stripe's data + */ + virtual uint64_t getDataLength()const = 0; + + /** + * Get the length of the stripe's tail section, which contains its index. + * @return the number of bytes in the tail + */ + virtual uint64_t getFooterLength() const = 0; + + /** + * Get the number of rows in the stripe. + * @return a count of the number of rows + */ + virtual uint64_t getNumberOfRows() const = 0; + + /** + * Get the number of streams in the stripe. + */ + virtual uint64_t getNumberOfStreams() const = 0; + + /** + * Get the StreamInformation for the given stream. + */ + virtual ORC_UNIQUE_PTR + getStreamInformation(uint64_t streamId) const = 0; + + /** + * Get the column encoding for the given column. 
+ * @param colId the columnId + */ + virtual ColumnEncodingKind getColumnEncoding(uint64_t colId) const = 0; + + /** + * Get the dictionary size. + * @param colId the columnId + * @return the size of the dictionary or 0 if there isn't one + */ + virtual uint64_t getDictionarySize(uint64_t colId) const = 0; + + /** + * Get the writer timezone. + */ + virtual const std::string& getWriterTimezone() const = 0; + }; +} + +#endif diff --git a/c++/include/orc/Int128.hh b/c++/include/orc/Int128.hh index 70793dcdd6..023f2a21d6 100644 --- a/c++/include/orc/Int128.hh +++ b/c++/include/orc/Int128.hh @@ -150,6 +150,26 @@ namespace orc { */ Int128 divide(const Int128 &right, Int128& remainder) const; + /** + * Divide this number by right and return the integral part. This operation + * is destructive. + * @param right the number to divide by + */ + Int128& operator/=(const Int128 &right) { + *this = operator/(right); + return *this; + } + + /** + * Divide this number by right and return the integral part. This operation + * is not destructive. + * @param right the number to divide by + */ + Int128 operator/(const Int128 &right) const { + Int128 remainder; + return divide(right, remainder); + } + /** * Logical or between two Int128. * @param right the number to or in @@ -161,10 +181,20 @@ namespace orc { return *this; } + /** + * Logical or between two Int128. + * @param right the number to or in + * @return result + */ + Int128 operator|(const Int128 right) { + Int128 value = *this; + value |= right; + return value; + } + /** * Logical and between two Int128. * @param right the number to and in - * @return *this */ Int128& operator&=(const Int128 &right) { lowbits &= right.lowbits; @@ -172,6 +202,16 @@ namespace orc { return *this; } + /** + * Logical and between two Int128. + * @param right the number to and in + */ + Int128 operator&(const Int128 &right) { + Int128 value = *this; + value &= right; + return value; + } + /** * Shift left by the given number of bits. 
* Values larger than 2**127 will shift into the sign bit. @@ -193,6 +233,16 @@ namespace orc { return *this; } + /** + * Shift left by the given number of bits. + * Values larger than 2**127 will shift into the sign bit. + */ + Int128 operator<<(uint32_t bits) { + Int128 value = *this; + value <<= bits; + return value; + } + /** * Shift right by the given number of bits. Negative values will * sign extend and fill with one bits. @@ -215,6 +265,16 @@ namespace orc { return *this; } + /** + * Shift right by the given number of bits. Negative values will + * sign extend and fill with one bits. + */ + Int128 operator>>(uint32_t bits) { + Int128 value = *this; + value >>= bits; + return value; + } + bool operator==(const Int128& right) const { return highbits == right.highbits && lowbits == right.lowbits; } @@ -332,5 +392,52 @@ namespace orc { int64_t highbits; uint64_t lowbits; }; + + /** + * Scales an Int128 value + * @param value the Int128 value to scale + * @param scaleMultiplier the scale offset. Result of a negative + * scaleMultiplier is undefined. 
+ * @param overflow returns whether the result overflows or not + * @return the scaled value + */ + Int128 scaleInt128(Int128 value, + int32_t scaleMultiplier, + bool &overflow); + + /** + * Compares two decimals + * @param lValue the integer representation of left decimal + * @param lScale the scale of left decimal + * @param rValue the integer representation of right decimal + * @param rScale the scale of right decimal + * @return -1 if left decimal is smaller, or + * 1 if right decimal is smaller, or + * 0 if they are equal + */ + int32_t decimalCompare(Int128 lValue, int32_t lScale, + Int128 rValue, int32_t rScale); + + const int32_t MAX_PRECISION_64 = 18; + const int64_t POWERS_OF_TEN[MAX_PRECISION_64 + 1] = + {1, + 10, + 100, + 1000, + 10000, + 100000, + 1000000, + 10000000, + 100000000, + 1000000000, + 10000000000, + 100000000000, + 1000000000000, + 10000000000000, + 100000000000000, + 1000000000000000, + 10000000000000000, + 100000000000000000, + 1000000000000000000}; } #endif diff --git a/c++/include/orc/Reader.hh b/c++/include/orc/Reader.hh index 76d7853451..3912bd77a1 100644 --- a/c++/include/orc/Reader.hh +++ b/c++/include/orc/Reader.hh @@ -19,6 +19,7 @@ #ifndef ORC_READER_HH #define ORC_READER_HH +#include "orc/Common.hh" #include "orc/orc-config.hh" #include "orc/Statistics.hh" #include "orc/Type.hh" @@ -34,142 +35,6 @@ namespace orc { struct ReaderOptionsPrivate; struct RowReaderOptionsPrivate; - enum CompressionKind { - CompressionKind_NONE = 0, - CompressionKind_ZLIB = 1, - CompressionKind_SNAPPY = 2, - CompressionKind_LZO = 3, - CompressionKind_LZ4 = 4, - CompressionKind_ZSTD = 5, - CompressionKind_MAX = INT64_MAX - }; - - /** - * Get the name of the CompressionKind. 
- */ - std::string compressionKindToString(CompressionKind kind); - - enum WriterVersion { - WriterVersion_ORIGINAL = 0, - WriterVersion_HIVE_8732 = 1, - WriterVersion_HIVE_4243 = 2, - WriterVersion_HIVE_12055 = 3, - WriterVersion_HIVE_13083 = 4, - WriterVersion_ORC_101 = 5, - WriterVersion_ORC_135 = 6, - WriterVersion_MAX = INT64_MAX - }; - - /** - * Get the name of the WriterVersion. - */ - std::string writerVersionToString(WriterVersion kind); - - enum StreamKind { - StreamKind_PRESENT = 0, - StreamKind_DATA = 1, - StreamKind_LENGTH = 2, - StreamKind_DICTIONARY_DATA = 3, - StreamKind_DICTIONARY_COUNT = 4, - StreamKind_SECONDARY = 5, - StreamKind_ROW_INDEX = 6, - StreamKind_BLOOM_FILTER = 7 - }; - - /** - * Get the string representation of the StreamKind. - */ - std::string streamKindToString(StreamKind kind); - - class StreamInformation { - public: - virtual ~StreamInformation(); - - virtual StreamKind getKind() const = 0; - virtual uint64_t getColumnId() const = 0; - virtual uint64_t getOffset() const = 0; - virtual uint64_t getLength() const = 0; - }; - - enum ColumnEncodingKind { - ColumnEncodingKind_DIRECT = 0, - ColumnEncodingKind_DICTIONARY = 1, - ColumnEncodingKind_DIRECT_V2 = 2, - ColumnEncodingKind_DICTIONARY_V2 = 3 - }; - - std::string columnEncodingKindToString(ColumnEncodingKind kind); - - class StripeInformation { - public: - virtual ~StripeInformation(); - - /** - * Get the byte offset of the start of the stripe. - * @return the bytes from the start of the file - */ - virtual uint64_t getOffset() const = 0; - - /** - * Get the total length of the stripe in bytes. - * @return the number of bytes in the stripe - */ - virtual uint64_t getLength() const = 0; - - /** - * Get the length of the stripe's indexes. - * @return the number of bytes in the index - */ - virtual uint64_t getIndexLength() const = 0; - - /** - * Get the length of the stripe's data. 
- * @return the number of bytes in the stripe - */ - virtual uint64_t getDataLength()const = 0; - - /** - * Get the length of the stripe's tail section, which contains its index. - * @return the number of bytes in the tail - */ - virtual uint64_t getFooterLength() const = 0; - - /** - * Get the number of rows in the stripe. - * @return a count of the number of rows - */ - virtual uint64_t getNumberOfRows() const = 0; - - /** - * Get the number of streams in the stripe. - */ - virtual uint64_t getNumberOfStreams() const = 0; - - /** - * Get the StreamInformation for the given stream. - */ - virtual ORC_UNIQUE_PTR - getStreamInformation(uint64_t streamId) const = 0; - - /** - * Get the column encoding for the given column. - * @param colId the columnId - */ - virtual ColumnEncodingKind getColumnEncoding(uint64_t colId) const = 0; - - /** - * Get the dictionary size. - * @param colId the columnId - * @return the size of the dictionary or 0 if there isn't one - */ - virtual uint64_t getDictionarySize(uint64_t colId) const = 0; - - /** - * Get the writer timezone. - */ - virtual const std::string& getWriterTimezone() const = 0; - }; - /** * Options for creating a Reader. */ diff --git a/c++/include/orc/Statistics.hh b/c++/include/orc/Statistics.hh index c65eb3ce7d..489838f252 100644 --- a/c++/include/orc/Statistics.hh +++ b/c++/include/orc/Statistics.hh @@ -37,12 +37,76 @@ namespace orc { * of rows because of NULL values and repeated values. 
* @return the number of values */ - virtual uint64_t getNumberOfValues() const = 0; + virtual uint64_t getNumberOfValues() const { + return valueCount; + } + + /** + * Set the number of values in this column + * @param newValueCount new number of values to be set + */ + virtual void setNumberOfValues(uint64_t newValueCount) { + valueCount = newValueCount; + } + + /** + * Check whether column has null value + * @return true if has null value + */ + virtual bool hasNull() const { + return hasNullValue; + } + + /** + * Set whether column has null value + * @param newHasNull has null value + */ + virtual void setHasNull(bool newHasNull) { + hasNullValue = newHasNull; + } /** * print out statistics of column if any */ virtual std::string toString() const = 0; + + /** + * Increases count of values + * @param count number of values to be increased + */ + virtual void increase(uint64_t count) { + valueCount += count; + } + + /** + * reset column statistics to initial state + */ + virtual void reset() { + hasNullValue = false; + valueCount = 0; + } + + /** + * Merges another statistics + * @param other statistics to be merged + */ + virtual void merge(const ColumnStatistics& other) { + hasNullValue |= other.hasNull(); + valueCount += other.getNumberOfValues(); + } + + /** + * Convert statistics to protobuf version + * @param pbStats output of protobuf stats + */ + virtual void toProtoBuf(proto::ColumnStatistics& pbStats) const { + pbStats.set_hasnull(hasNullValue); + pbStats.set_numberofvalues(valueCount); + } + + protected: + uint64_t valueCount; + bool hasNullValue; }; /** @@ -58,7 +122,30 @@ namespace orc { */ virtual bool hasTotalLength() const = 0; + /** + * set has total length + * @param newHasTotalLength has total length + */ + virtual void setHasTotalLength(bool newHasTotalLength) = 0; + + /** + * get total length + * @return total length + */ virtual uint64_t getTotalLength() const = 0; + + /** + * set total length + * @param newTotalLength new total length value 
+ */ + virtual void setTotalLength(uint64_t newTotalLength) = 0; + + /** + * update stats by a new value + * @param value new value to update + * @param length length of the value + */ + virtual void update(const char* value, size_t length) = 0; }; /** @@ -74,8 +161,23 @@ namespace orc { */ virtual bool hasCount() const = 0; + /** + * set hasCount value + * @param hasCount new hasCount value + */ + virtual void setHasCount(bool hasCount) = 0; + virtual uint64_t getFalseCount() const = 0; virtual uint64_t getTrueCount() const = 0; + + virtual void setTrueCount(uint64_t trueCount) = 0; + + /** + * update stats by a new value + * @param value new value to update + * @param repetitions the repetitions of the boolean value + */ + virtual void update(bool value, size_t repetitions) = 0; }; /** @@ -103,11 +205,29 @@ namespace orc { */ virtual int32_t getMinimum() const = 0; + /** + * set new minimum value + * @param min new minimum value + */ + virtual void setMinimum(int32_t min) = 0; + /** * Get the maximum value for the column. * @return maximum value */ virtual int32_t getMaximum() const = 0; + + /** + * set new maximum value + * @param max new maximum value + */ + virtual void setMaximum(int32_t max) = 0; + + /** + * update stats by a new value + * @param value new value to update + */ + virtual void update(int32_t value) = 0; }; /** @@ -135,23 +255,67 @@ namespace orc { */ virtual bool hasSum() const = 0; + /** + * set hasSum value + * @param newHasSum hasSum value + */ + virtual void setHasSum(bool newHasSum) = 0; + /** * Get the minimum value for the column. * @return minimum value */ virtual Decimal getMinimum() const = 0; + /** + * set new minimum value + * @param min new minimum value + */ + virtual void setMinimum(Decimal min) = 0; + /** * Get the maximum value for the column. 
* @return maximum value */ virtual Decimal getMaximum() const = 0; + /** + * set new maximum value + * @param max new maximum value + */ + virtual void setMaximum(Decimal max) = 0; + /** * Get the sum for the column. * @return sum of all the values */ virtual Decimal getSum() const = 0; + + /** + * set new sum + * @param newSum sum to be set + */ + virtual void setSum(Decimal newSum) = 0; + + /** + * update stats by a new value + * @param value new value to update + */ + virtual void update(const Decimal& value) = 0; + + /** + * update stats by a new value + * @param value new decimal value represented in Int128 + * @param scale scale of the decimal + */ + virtual void update(Int128 value, int32_t scale) = 0; + + /** + * update stats by a new value + * @param value new decimal value represented in int64_t + * @param scale scale of the decimal + */ + virtual void update(int64_t value, int32_t scale) = 0; }; /** @@ -179,6 +343,12 @@ namespace orc { */ virtual bool hasSum() const = 0; + /** + * set hasSum value + * @param newHasSum hasSum value + */ + virtual void setHasSum(bool newHasSum) = 0; + /** * Get the smallest value in the column. Only defined if getNumberOfValues * is non-zero. @@ -186,6 +356,12 @@ namespace orc { */ virtual double getMinimum() const = 0; + /** + * set new minimum value + * @param min new minimum value + */ + virtual void setMinimum(double min) = 0; + /** * Get the largest value in the column. Only defined if getNumberOfValues * is non-zero. @@ -193,11 +369,29 @@ namespace orc { */ virtual double getMaximum() const = 0; + /** + * set new maximum value + * @param max new maximum value + */ + virtual void setMaximum(double max) = 0; + /** * Get the sum of the values in the column. 
* @return the sum */ virtual double getSum() const = 0; + + /** + * set new sum + * @param newSum sum to be set + */ + virtual void setSum(double newSum) = 0; + + /** + * update stats by a new value + * @param value new value to update + */ + virtual void update(double value) = 0; }; /** @@ -226,6 +420,12 @@ namespace orc { */ virtual bool hasSum() const = 0; + /** + * set hasSum value + * @param newHasSum hasSum value + */ + virtual void setHasSum(bool newHasSum) = 0; + /** * Get the smallest value in the column. Only defined if getNumberOfValues * is non-zero. @@ -233,6 +433,12 @@ namespace orc { */ virtual int64_t getMinimum() const = 0; + /** + * set new minimum value + * @param min new minimum value + */ + virtual void setMinimum(int64_t min) = 0; + /** * Get the largest value in the column. Only defined if getNumberOfValues * is non-zero. @@ -240,11 +446,30 @@ namespace orc { */ virtual int64_t getMaximum() const = 0; + /** + * set new maximum value + * @param max new maximum value + */ + virtual void setMaximum(int64_t max) = 0; + /** * Get the sum of the column. Only valid if isSumDefined returns true. * @return the sum of the column */ virtual int64_t getSum() const = 0; + + /** + * set new sum + * @param newSum sum to be set + */ + virtual void setSum(int64_t newSum) = 0; + + /** + * update stats by a new value + * @param value new value to update + * @param repetitions repetition of the value + */ + virtual void update(int64_t value, int repetitions) = 0; }; /** @@ -272,23 +497,60 @@ namespace orc { */ virtual bool hasTotalLength() const = 0; + /** + * set has total length + * @param newHasTotalLength has total length + */ + virtual void setHasTotalLength(bool newHasTotalLength) = 0; + /** * Get the minimum value for the column. 
* @return minimum value */ virtual std::string getMinimum() const = 0; + /** + * set new minimum value + * @param min new minimum value + */ + virtual void setMinimum(std::string min) = 0; + /** * Get the maximum value for the column. * @return maximum value */ virtual std::string getMaximum() const = 0; + /** + * set new maximum value + * @param max new maximum value + */ + virtual void setMaximum(std::string max) = 0; + /** * Get the total length of all values. * @return total length of all the values */ virtual uint64_t getTotalLength() const = 0; + + /** + * set total length + * @param newTotalLength new total length value + */ + virtual void setTotalLength(uint64_t newTotalLength) = 0; + + /** + * update stats by a new value + * @param value new value to update + */ + virtual void update(const std::string& value) = 0; + + /** + * update stats by a new value + * @param value new value to update + * @param length length of the value + */ + virtual void update(const char* value, size_t length) = 0; }; /** @@ -316,11 +578,29 @@ namespace orc { */ virtual int64_t getMinimum() const = 0; + /** + * set new minimum value + * @param min new minimum value + */ + virtual void setMinimum(int64_t min) = 0; + /** * Get the maximum value for the column. 
* @return maximum value */ virtual int64_t getMaximum() const = 0; + + /** + * set new maximum value + * @param max new maximum value + */ + virtual void setMaximum(int64_t max) = 0; + + /** + * update stats by a new value + * @param value new value to update + */ + virtual void update(int64_t value) = 0; }; class Statistics { @@ -341,6 +621,8 @@ namespace orc { virtual uint32_t getNumberOfColumns() const = 0; }; + std::unique_ptr createColumnStatistics( + const Type& type, bool enableStringComparison); } diff --git a/c++/include/orc/Type.hh b/c++/include/orc/Type.hh index 25b8f535c3..32cbbed99e 100644 --- a/c++/include/orc/Type.hh +++ b/c++/include/orc/Type.hh @@ -82,6 +82,11 @@ namespace orc { * @return a reference to the union type */ virtual Type* addUnionChild(ORC_UNIQUE_PTR fieldType) = 0; + + /** + * Build a Type object from string text representation. + */ + static Type* buildTypeFromString(const std::string& input); }; const int64_t DEFAULT_DECIMAL_SCALE = 18; diff --git a/c++/src/CMakeLists.txt b/c++/src/CMakeLists.txt index d717fb4d8e..51bdf1b5dc 100644 --- a/c++/src/CMakeLists.txt +++ b/c++/src/CMakeLists.txt @@ -139,6 +139,7 @@ add_library (orc STATIC ByteRLE.cc ColumnPrinter.cc ColumnReader.cc + Common.cc Compression.cc Exceptions.cc Int128.cc @@ -150,6 +151,7 @@ add_library (orc STATIC RLEv2.cc RLE.cc Statistics.cc + Stream.cc StripeStream.cc Timezone.cc TypeImpl.cc diff --git a/c++/src/Common.cc b/c++/src/Common.cc new file mode 100644 index 0000000000..7813612933 --- /dev/null +++ b/c++/src/Common.cc @@ -0,0 +1,107 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "orc/Common.hh" + +#include + +namespace orc { + + std::string compressionKindToString(CompressionKind kind) { + switch (static_cast(kind)) { + case CompressionKind_NONE: + return "none"; + case CompressionKind_ZLIB: + return "zlib"; + case CompressionKind_SNAPPY: + return "snappy"; + case CompressionKind_LZO: + return "lzo"; + case CompressionKind_LZ4: + return "lz4"; + case CompressionKind_ZSTD: + return "zstd"; + } + std::stringstream buffer; + buffer << "unknown - " << kind; + return buffer.str(); + } + + std::string writerVersionToString(WriterVersion version) { + switch (static_cast(version)) { + case WriterVersion_ORIGINAL: + return "original"; + case WriterVersion_HIVE_8732: + return "HIVE-8732"; + case WriterVersion_HIVE_4243: + return "HIVE-4243"; + case WriterVersion_HIVE_12055: + return "HIVE-12055"; + case WriterVersion_HIVE_13083: + return "HIVE-13083"; + case WriterVersion_ORC_101: + return "ORC-101"; + case WriterVersion_ORC_135: + return "ORC-135"; + } + std::stringstream buffer; + buffer << "future - " << version; + return buffer.str(); + } + + std::string streamKindToString(StreamKind kind) { + switch (static_cast(kind)) { + case StreamKind_PRESENT: + return "present"; + case StreamKind_DATA: + return "data"; + case StreamKind_LENGTH: + return "length"; + case StreamKind_DICTIONARY_DATA: + return "dictionary"; + case StreamKind_DICTIONARY_COUNT: + return "dictionary count"; + case StreamKind_SECONDARY: + return "secondary"; + case StreamKind_ROW_INDEX: + return "index"; + case StreamKind_BLOOM_FILTER: + return 
"bloom"; + } + std::stringstream buffer; + buffer << "unknown - " << kind; + return buffer.str(); + } + + std::string columnEncodingKindToString(ColumnEncodingKind kind) { + switch (static_cast(kind)) { + case ColumnEncodingKind_DIRECT: + return "direct"; + case ColumnEncodingKind_DICTIONARY: + return "dictionary"; + case ColumnEncodingKind_DIRECT_V2: + return "direct rle2"; + case ColumnEncodingKind_DICTIONARY_V2: + return "dictionary rle2"; + } + std::stringstream buffer; + buffer << "unknown - " << kind; + return buffer.str(); + } + +} diff --git a/c++/src/Compression.cc b/c++/src/Compression.cc index 81cc578550..e2f9dbf4d5 100644 --- a/c++/src/Compression.cc +++ b/c++/src/Compression.cc @@ -33,201 +33,6 @@ namespace orc { - void printBuffer(std::ostream& out, - const char *buffer, - uint64_t length) { - const uint64_t width = 24; - out << std::hex; - for(uint64_t line = 0; line < (length + width - 1) / width; ++line) { - out << std::setfill('0') << std::setw(7) << (line * width); - for(uint64_t byte = 0; - byte < width && line * width + byte < length; ++byte) { - out << " " << std::setfill('0') << std::setw(2) - << static_cast(0xff & buffer[line * width + - byte]); - } - out << "\n"; - } - out << std::dec; - } - - PositionProvider::PositionProvider(const std::list& posns) { - position = posns.begin(); - } - - uint64_t PositionProvider::next() { - uint64_t result = *position; - ++position; - return result; - } - - SeekableInputStream::~SeekableInputStream() { - // PASS - } - - SeekableArrayInputStream::~SeekableArrayInputStream() { - // PASS - } - - SeekableArrayInputStream::SeekableArrayInputStream - (const unsigned char* values, - uint64_t size, - uint64_t blkSize - ): data(reinterpret_cast(values)) { - length = size; - position = 0; - blockSize = blkSize == 0 ? 
length : static_cast(blkSize); - } - - SeekableArrayInputStream::SeekableArrayInputStream(const char* values, - uint64_t size, - uint64_t blkSize - ): data(values) { - length = size; - position = 0; - blockSize = blkSize == 0 ? length : static_cast(blkSize); - } - - bool SeekableArrayInputStream::Next(const void** buffer, int*size) { - uint64_t currentSize = std::min(length - position, blockSize); - if (currentSize > 0) { - *buffer = data + position; - *size = static_cast(currentSize); - position += currentSize; - return true; - } - *size = 0; - return false; - } - - void SeekableArrayInputStream::BackUp(int count) { - if (count >= 0) { - uint64_t unsignedCount = static_cast(count); - if (unsignedCount <= blockSize && unsignedCount <= position) { - position -= unsignedCount; - } else { - throw std::logic_error("Can't backup that much!"); - } - } - } - - bool SeekableArrayInputStream::Skip(int count) { - if (count >= 0) { - uint64_t unsignedCount = static_cast(count); - if (unsignedCount + position <= length) { - position += unsignedCount; - return true; - } else { - position = length; - } - } - return false; - } - - google::protobuf::int64 SeekableArrayInputStream::ByteCount() const { - return static_cast(position); - } - - void SeekableArrayInputStream::seek(PositionProvider& seekPosition) { - position = seekPosition.next(); - } - - std::string SeekableArrayInputStream::getName() const { - std::ostringstream result; - result << "SeekableArrayInputStream " << position << " of " << length; - return result.str(); - } - - static uint64_t computeBlock(uint64_t request, uint64_t length) { - return std::min(length, request == 0 ? 
256 * 1024 : request); - } - - SeekableFileInputStream::SeekableFileInputStream(InputStream* stream, - uint64_t offset, - uint64_t byteCount, - MemoryPool& _pool, - uint64_t _blockSize - ):pool(_pool), - input(stream), - start(offset), - length(byteCount), - blockSize(computeBlock - (_blockSize, - length)) { - - position = 0; - buffer.reset(new DataBuffer(pool)); - pushBack = 0; - } - - SeekableFileInputStream::~SeekableFileInputStream() { - // PASS - } - - bool SeekableFileInputStream::Next(const void** data, int*size) { - uint64_t bytesRead; - if (pushBack != 0) { - *data = buffer->data() + (buffer->size() - pushBack); - bytesRead = pushBack; - } else { - bytesRead = std::min(length - position, blockSize); - buffer->resize(bytesRead); - if (bytesRead > 0) { - input->read(buffer->data(), bytesRead, start+position); - *data = static_cast(buffer->data()); - } - } - position += bytesRead; - pushBack = 0; - *size = static_cast(bytesRead); - return bytesRead != 0; - } - - void SeekableFileInputStream::BackUp(int signedCount) { - if (signedCount < 0) { - throw std::logic_error("can't backup negative distances"); - } - uint64_t count = static_cast(signedCount); - if (pushBack > 0) { - throw std::logic_error("can't backup unless we just called Next"); - } - if (count > blockSize || count > position) { - throw std::logic_error("can't backup that far"); - } - pushBack = static_cast(count); - position -= pushBack; - } - - bool SeekableFileInputStream::Skip(int signedCount) { - if (signedCount < 0) { - return false; - } - uint64_t count = static_cast(signedCount); - position = std::min(position + count, length); - pushBack = 0; - return position < length; - } - - int64_t SeekableFileInputStream::ByteCount() const { - return static_cast(position); - } - - void SeekableFileInputStream::seek(PositionProvider& location) { - position = location.next(); - if (position > length) { - position = length; - throw std::logic_error("seek too far"); - } - pushBack = 0; - } - - std::string 
SeekableFileInputStream::getName() const { - std::ostringstream result; - result << input->getName() << " from " << start << " for " - << length; - return result.str(); - } - enum DecompressState { DECOMPRESS_HEADER, DECOMPRESS_START, DECOMPRESS_CONTINUE, diff --git a/c++/src/Compression.hh b/c++/src/Compression.hh index efd374a6f5..c02c9a4edc 100644 --- a/c++/src/Compression.hh +++ b/c++/src/Compression.hh @@ -19,100 +19,10 @@ #ifndef ORC_COMPRESSION_HH #define ORC_COMPRESSION_HH -#include "orc/OrcFile.hh" - -#include "Adaptor.hh" -#include "wrap/zero-copy-stream-wrapper.h" - -#include -#include -#include -#include -#include -#include +#include "Stream.hh" namespace orc { - void printBuffer(std::ostream& out, - const char *buffer, - uint64_t length); - - class PositionProvider { - private: - std::list::const_iterator position; - public: - PositionProvider(const std::list& positions); - uint64_t next(); - }; - - /** - * A subclass of Google's ZeroCopyInputStream that supports seek. - * By extending Google's class, we get the ability to pass it directly - * to the protobuf readers. - */ - class SeekableInputStream: public google::protobuf::io::ZeroCopyInputStream { - public: - virtual ~SeekableInputStream(); - virtual void seek(PositionProvider& position) = 0; - virtual std::string getName() const = 0; - }; - - /** - * Create a seekable input stream based on a memory range. 
- */ - class SeekableArrayInputStream: public SeekableInputStream { - private: - const char* data; - uint64_t length; - uint64_t position; - uint64_t blockSize; - - public: - SeekableArrayInputStream(const unsigned char* list, - uint64_t length, - uint64_t block_size = 0); - SeekableArrayInputStream(const char* list, - uint64_t length, - uint64_t block_size = 0); - virtual ~SeekableArrayInputStream(); - virtual bool Next(const void** data, int*size) override; - virtual void BackUp(int count) override; - virtual bool Skip(int count) override; - virtual google::protobuf::int64 ByteCount() const override; - virtual void seek(PositionProvider& position) override; - virtual std::string getName() const override; - }; - - /** - * Create a seekable input stream based on an input stream. - */ - class SeekableFileInputStream: public SeekableInputStream { - private: - MemoryPool& pool; - InputStream* const input; - const uint64_t start; - const uint64_t length; - const uint64_t blockSize; - std::unique_ptr > buffer; - uint64_t position; - uint64_t pushBack; - - public: - SeekableFileInputStream(InputStream* input, - uint64_t offset, - uint64_t byteCount, - MemoryPool& pool, - uint64_t blockSize = 0); - virtual ~SeekableFileInputStream(); - - virtual bool Next(const void** data, int*size) override; - virtual void BackUp(int count) override; - virtual bool Skip(int count) override; - virtual int64_t ByteCount() const override; - virtual void seek(PositionProvider& position) override; - virtual std::string getName() const override; - }; - /** * Create a decompressor for the given compression kind. 
* @param kind the compression type to implement diff --git a/c++/src/Int128.cc b/c++/src/Int128.cc index ece78505b8..5ef71e6d5e 100644 --- a/c++/src/Int128.cc +++ b/c++/src/Int128.cc @@ -435,4 +435,76 @@ namespace orc { return buf.str(); } + Int128 scaleInt128(Int128 value, + int32_t scale, + bool &overflow) { + overflow = false; + + while (scale > 0) { + int32_t step = std::min(scale, MAX_PRECISION_64); + if (value > 0 && Int128::maximumValue() / POWERS_OF_TEN[step] < value) { + overflow = true; + return Int128::maximumValue(); + } else if (value < 0 && Int128::minimumValue() / POWERS_OF_TEN[step] > value) { + overflow = true; + return Int128::minimumValue(); + } + + value *= POWERS_OF_TEN[step]; + scale -= step; + } + + return value; + } + + Int128 downScaleInt128(Int128 value, int32_t scale) { + while (scale > 0) { + int32_t step = std::min(std::abs(scale), MAX_PRECISION_64); + value /= POWERS_OF_TEN[step]; + scale -= step; + } + return value; + } + + int32_t decimalCompare(Int128 lValue, int32_t lScale, + Int128 rValue, int32_t rScale) { + // compare integral parts + Int128 lIntegral = downScaleInt128(lValue, lScale); + Int128 rIntegral = downScaleInt128(rValue, rScale); + + if (lIntegral < rIntegral) { + return -1; + } else if (lIntegral > rIntegral) { + return 1; + } + + // integral parts are equal, continue comparing fractional parts + // unnecessary to check overflow here because the scaled number will not + // exceed original ones + bool overflow = false, positive = lValue >= 0; + lValue -= scaleInt128(lIntegral, lScale, overflow); + rValue -= scaleInt128(rIntegral, rScale, overflow); + + int32_t diff = lScale - rScale; + if (diff > 0) { + rValue = scaleInt128(rValue, diff, overflow); + if (overflow) { + return positive ? -1 : 1; + } + } else { + lValue = scaleInt128(lValue, -diff, overflow); + if (overflow) { + return positive ?
1 : -1; + } + } + + if (lValue < rValue) { + return -1; + } else if (lValue > rValue) { + return 1; + } else { + return 0; + } + } + } diff --git a/c++/src/Reader.cc b/c++/src/Reader.cc index 034586d013..5edb9df8c1 100644 --- a/c++/src/Reader.cc +++ b/c++/src/Reader.cc @@ -35,48 +35,6 @@ namespace orc { - std::string compressionKindToString(CompressionKind kind) { - switch (static_cast(kind)) { - case CompressionKind_NONE: - return "none"; - case CompressionKind_ZLIB: - return "zlib"; - case CompressionKind_SNAPPY: - return "snappy"; - case CompressionKind_LZO: - return "lzo"; - case CompressionKind_LZ4: - return "lz4"; - case CompressionKind_ZSTD: - return "zstd"; - } - std::stringstream buffer; - buffer << "unknown - " << kind; - return buffer.str(); - } - - std::string writerVersionToString(WriterVersion version) { - switch (static_cast(version)) { - case WriterVersion_ORIGINAL: - return "original"; - case WriterVersion_HIVE_8732: - return "HIVE-8732"; - case WriterVersion_HIVE_4243: - return "HIVE-4243"; - case WriterVersion_HIVE_12055: - return "HIVE-12055"; - case WriterVersion_HIVE_13083: - return "HIVE-13083"; - case WriterVersion_ORC_101: - return "ORC-101"; - case WriterVersion_ORC_135: - return "ORC-135"; - } - std::stringstream buffer; - buffer << "future - " << version; - return buffer.str(); - } - uint64_t getCompressionBlockSize(const proto::PostScript& ps) { if (ps.has_compressionblocksize()) { return ps.compressionblocksize(); @@ -951,46 +909,6 @@ namespace orc { postscriptLength)); } - std::string streamKindToString(StreamKind kind) { - switch (static_cast(kind)) { - case StreamKind_PRESENT: - return "present"; - case StreamKind_DATA: - return "data"; - case StreamKind_LENGTH: - return "length"; - case StreamKind_DICTIONARY_DATA: - return "dictionary"; - case StreamKind_DICTIONARY_COUNT: - return "dictionary count"; - case StreamKind_SECONDARY: - return "secondary"; - case StreamKind_ROW_INDEX: - return "index"; - case StreamKind_BLOOM_FILTER: - 
return "bloom"; - } - std::stringstream buffer; - buffer << "unknown - " << kind; - return buffer.str(); - } - - std::string columnEncodingKindToString(ColumnEncodingKind kind) { - switch (static_cast(kind)) { - case ColumnEncodingKind_DIRECT: - return "direct"; - case ColumnEncodingKind_DICTIONARY: - return "dictionary"; - case ColumnEncodingKind_DIRECT_V2: - return "direct rle2"; - case ColumnEncodingKind_DICTIONARY_V2: - return "dictionary rle2"; - } - std::stringstream buffer; - buffer << "unknown - " << kind; - return buffer.str(); - } - RowReader::~RowReader() { // PASS } diff --git a/c++/src/Statistics.cc b/c++/src/Statistics.cc index 083b722a44..83788c98ab 100644 --- a/c++/src/Statistics.cc +++ b/c++/src/Statistics.cc @@ -146,87 +146,94 @@ namespace orc { } ColumnStatisticsImpl::ColumnStatisticsImpl - (const proto::ColumnStatistics& pb) { + (const proto::ColumnStatistics& pb) { valueCount = pb.numberofvalues(); + hasNullValue = pb.hasnull(); } BinaryColumnStatisticsImpl::BinaryColumnStatisticsImpl - (const proto::ColumnStatistics& pb, bool correctStats){ + (const proto::ColumnStatistics& pb, bool correctStats){ valueCount = pb.numberofvalues(); + hasNullValue = pb.hasnull(); if (!pb.has_binarystatistics() || !correctStats) { - _hasTotalLength = false; - + hasTotalLengthValue = false; totalLength = 0; }else{ - _hasTotalLength = pb.binarystatistics().has_sum(); + hasTotalLengthValue = pb.binarystatistics().has_sum(); totalLength = static_cast(pb.binarystatistics().sum()); } } BooleanColumnStatisticsImpl::BooleanColumnStatisticsImpl - (const proto::ColumnStatistics& pb, bool correctStats){ + (const proto::ColumnStatistics& pb, bool correctStats){ valueCount = pb.numberofvalues(); + hasNullValue = pb.hasnull(); if (!pb.has_bucketstatistics() || !correctStats) { - _hasCount = false; + hasCountValue = false; trueCount = 0; }else{ - _hasCount = true; + hasCountValue = true; trueCount = pb.bucketstatistics().count(0); } } 
DateColumnStatisticsImpl::DateColumnStatisticsImpl - (const proto::ColumnStatistics& pb, bool correctStats){ + (const proto::ColumnStatistics& pb, bool correctStats){ valueCount = pb.numberofvalues(); + hasNullValue = pb.hasnull(); if (!pb.has_datestatistics() || !correctStats) { - _hasMinimum = false; - _hasMaximum = false; + hasMinimumValue = false; + hasMaximumValue = false; minimum = 0; maximum = 0; } else { - _hasMinimum = pb.datestatistics().has_minimum(); - _hasMaximum = pb.datestatistics().has_maximum(); + hasMinimumValue = pb.datestatistics().has_minimum(); + hasMaximumValue = pb.datestatistics().has_maximum(); minimum = pb.datestatistics().minimum(); maximum = pb.datestatistics().maximum(); } } DecimalColumnStatisticsImpl::DecimalColumnStatisticsImpl - (const proto::ColumnStatistics& pb, bool correctStats){ + (const proto::ColumnStatistics& pb, bool correctStats): minimum(0, 0), + maximum(0, 0), + sum(0, 0){ valueCount = pb.numberofvalues(); + hasNullValue = pb.hasnull(); if (!pb.has_decimalstatistics() || !correctStats) { - _hasMinimum = false; - _hasMaximum = false; - _hasSum = false; + hasMinimumValue = false; + hasMaximumValue = false; + hasSumValue = false; }else{ const proto::DecimalStatistics& stats = pb.decimalstatistics(); - _hasMinimum = stats.has_minimum(); - _hasMaximum = stats.has_maximum(); - _hasSum = stats.has_sum(); + hasMinimumValue = stats.has_minimum(); + hasMaximumValue = stats.has_maximum(); + hasSumValue = stats.has_sum(); - minimum = stats.minimum(); - maximum = stats.maximum(); - sum = stats.sum(); + minimum = Decimal(stats.minimum()); + maximum = Decimal(stats.maximum()); + sum = Decimal(stats.sum()); } } DoubleColumnStatisticsImpl::DoubleColumnStatisticsImpl - (const proto::ColumnStatistics& pb){ + (const proto::ColumnStatistics& pb){ valueCount = pb.numberofvalues(); + hasNullValue = pb.hasnull(); if (!pb.has_doublestatistics()) { - _hasMinimum = false; - _hasMaximum = false; - _hasSum = false; + hasMinimumValue = false; + 
hasMaximumValue = false; + hasSumValue = false; minimum = 0; maximum = 0; sum = 0; }else{ const proto::DoubleStatistics& stats = pb.doublestatistics(); - _hasMinimum = stats.has_minimum(); - _hasMaximum = stats.has_maximum(); - _hasSum = stats.has_sum(); + hasMinimumValue = stats.has_minimum(); + hasMaximumValue = stats.has_maximum(); + hasSumValue = stats.has_sum(); minimum = stats.minimum(); maximum = stats.maximum(); @@ -235,21 +242,22 @@ namespace orc { } IntegerColumnStatisticsImpl::IntegerColumnStatisticsImpl - (const proto::ColumnStatistics& pb){ + (const proto::ColumnStatistics& pb){ valueCount = pb.numberofvalues(); + hasNullValue = pb.hasnull(); if (!pb.has_intstatistics()) { - _hasMinimum = false; - _hasMaximum = false; - _hasSum = false; + hasMinimumValue = false; + hasMaximumValue = false; + hasSumValue = false; minimum = 0; maximum = 0; sum = 0; }else{ const proto::IntegerStatistics& stats = pb.intstatistics(); - _hasMinimum = stats.has_minimum(); - _hasMaximum = stats.has_maximum(); - _hasSum = stats.has_sum(); + hasMinimumValue = stats.has_minimum(); + hasMaximumValue = stats.has_maximum(); + hasSumValue = stats.has_sum(); minimum = stats.minimum(); maximum = stats.maximum(); @@ -258,19 +266,20 @@ namespace orc { } StringColumnStatisticsImpl::StringColumnStatisticsImpl - (const proto::ColumnStatistics& pb, bool correctStats){ + (const proto::ColumnStatistics& pb, bool correctStats){ valueCount = pb.numberofvalues(); + hasNullValue = pb.hasnull(); if (!pb.has_stringstatistics() || !correctStats) { - _hasMinimum = false; - _hasMaximum = false; - _hasTotalLength = false; + hasMinimumValue = false; + hasMaximumValue = false; + hasTotalLen = false; totalLength = 0; }else{ const proto::StringStatistics& stats = pb.stringstatistics(); - _hasMinimum = stats.has_minimum(); - _hasMaximum = stats.has_maximum(); - _hasTotalLength = stats.has_sum(); + hasMinimumValue = stats.has_minimum(); + hasMaximumValue = stats.has_maximum(); + hasTotalLen = stats.has_sum(); 
minimum = stats.minimum(); maximum = stats.maximum(); @@ -279,21 +288,67 @@ namespace orc { } TimestampColumnStatisticsImpl::TimestampColumnStatisticsImpl - (const proto::ColumnStatistics& pb, bool correctStats) { + (const proto::ColumnStatistics& pb, bool correctStats) { valueCount = pb.numberofvalues(); + hasNullValue = pb.hasnull(); if (!pb.has_timestampstatistics() || !correctStats) { - _hasMinimum = false; - _hasMaximum = false; + hasMinimumValue = false; + hasMaximumValue = false; minimum = 0; maximum = 0; }else{ const proto::TimestampStatistics& stats = pb.timestampstatistics(); - _hasMinimum = stats.has_minimum(); - _hasMaximum = stats.has_maximum(); + hasMinimumValue = stats.has_minimum(); + hasMaximumValue = stats.has_maximum(); minimum = stats.minimum(); maximum = stats.maximum(); } } + std::unique_ptr createColumnStatistics( + const Type& type, bool enableStrCmp) { + switch (static_cast(type.getKind())) { + case BOOLEAN: + return std::unique_ptr( + new BooleanColumnStatisticsImpl()); + case BYTE: + case INT: + case LONG: + case SHORT: + return std::unique_ptr( + new IntegerColumnStatisticsImpl()); + case STRUCT: + case MAP: + case LIST: + case UNION: + return std::unique_ptr( + new ColumnStatisticsImpl()); + case FLOAT: + case DOUBLE: + return std::unique_ptr( + new DoubleColumnStatisticsImpl()); + case BINARY: + return std::unique_ptr( + new BinaryColumnStatisticsImpl()); + case STRING: + case CHAR: + case VARCHAR: + return std::unique_ptr( + new StringColumnStatisticsImpl(enableStrCmp)); + case DATE: + return std::unique_ptr( + new DateColumnStatisticsImpl()); + case TIMESTAMP: + return std::unique_ptr( + new TimestampColumnStatisticsImpl()); + case DECIMAL: + return std::unique_ptr( + new DecimalColumnStatisticsImpl()); + default: + throw NotImplementedYet("Not supported type " + type.toString() + + " for ColumnStatistics"); + } + } + }// namespace diff --git a/c++/src/Statistics.hh b/c++/src/Statistics.hh index 1d1006ea2c..c44cffcd8a 100644 --- 
a/c++/src/Statistics.hh +++ b/c++/src/Statistics.hh @@ -32,55 +32,87 @@ namespace orc { */ class ColumnStatisticsImpl: public ColumnStatistics { - private: - uint64_t valueCount; - public: + ColumnStatisticsImpl() { + reset(); + } ColumnStatisticsImpl(const proto::ColumnStatistics& stats); virtual ~ColumnStatisticsImpl(); - uint64_t getNumberOfValues() const override { - return valueCount; - } - std::string toString() const override { std::ostringstream buffer; - buffer << "Column has " << valueCount << " values" << std::endl; + buffer << "Column has " << valueCount << " values" + << " and has null value: " << (hasNullValue ? "yes " : "no") + << std::endl; return buffer.str(); } }; class BinaryColumnStatisticsImpl: public BinaryColumnStatistics { private: - bool _hasTotalLength; - uint64_t valueCount; + bool hasTotalLengthValue; uint64_t totalLength; public: + BinaryColumnStatisticsImpl() { + reset(); + } BinaryColumnStatisticsImpl(const proto::ColumnStatistics& stats, bool correctStats); virtual ~BinaryColumnStatisticsImpl(); bool hasTotalLength() const override { - return _hasTotalLength; + return hasTotalLengthValue; } - uint64_t getNumberOfValues() const override { - return valueCount; + + void setHasTotalLength(bool newHasTotalLength) override { + hasTotalLengthValue = newHasTotalLength; } uint64_t getTotalLength() const override { - if(_hasTotalLength){ + if(hasTotalLengthValue){ return totalLength; }else{ throw ParseError("Total length is not defined."); } } + void setTotalLength(uint64_t length) override { + this->totalLength = length; + } + + void reset() override { + ColumnStatistics::reset(); + hasTotalLengthValue = false; + totalLength = 0; + } + + void merge(const ColumnStatistics& other) override { + ColumnStatistics::merge(other); + + const BinaryColumnStatistics& binColStats = + dynamic_cast(other); + + totalLength += binColStats.getTotalLength(); + } + + void toProtoBuf(proto::ColumnStatistics& pbStats) const override { + 
ColumnStatistics::toProtoBuf(pbStats); + + proto::BinaryStatistics* binStats = pbStats.mutable_binarystatistics(); + binStats->set_sum(static_cast(totalLength)); + } + + void update(const char*, size_t length) override { + totalLength += length; + } + std::string toString() const override { std::ostringstream buffer; buffer << "Data type: Binary" << std::endl - << "Values: " << valueCount << std::endl; - if(_hasTotalLength){ + << "Values: " << valueCount << std::endl + << "Has null: " << (hasNullValue ? "yes" : "no") << std::endl; + if(hasTotalLengthValue){ buffer << "Total length: " << totalLength << std::endl; }else{ buffer << "Total length: not defined" << std::endl; @@ -91,24 +123,23 @@ namespace orc { class BooleanColumnStatisticsImpl: public BooleanColumnStatistics { private: - bool _hasCount; - uint64_t valueCount; + bool hasCountValue; uint64_t trueCount; public: - BooleanColumnStatisticsImpl(const proto::ColumnStatistics& stats, bool correctStats); + BooleanColumnStatisticsImpl() { + reset(); + } + BooleanColumnStatisticsImpl(const proto::ColumnStatistics& stats, + bool correctStats); virtual ~BooleanColumnStatisticsImpl(); bool hasCount() const override { - return _hasCount; - } - - uint64_t getNumberOfValues() const override { - return valueCount; + return hasCountValue; } uint64_t getFalseCount() const override { - if(_hasCount){ + if(hasCountValue){ return valueCount - trueCount; }else{ throw ParseError("False count is not defined."); @@ -116,18 +147,57 @@ namespace orc { } uint64_t getTrueCount() const override { - if(_hasCount){ + if(hasCountValue){ return trueCount; }else{ throw ParseError("True count is not defined."); } } + virtual void setTrueCount(uint64_t count) override { + this->trueCount = count; + } + + void setHasCount(bool hasCount) override { + this->hasCountValue = hasCount; + } + + void reset() override { + ColumnStatistics::reset(); + hasCountValue = true; + trueCount = 0; + } + + void merge(const ColumnStatistics& other) override { + 
ColumnStatistics::merge(other); + + const BooleanColumnStatistics& boolStats = + dynamic_cast(other); + + trueCount += boolStats.getTrueCount(); + } + + void toProtoBuf(proto::ColumnStatistics& pbStats) const override { + ColumnStatistics::toProtoBuf(pbStats); + + proto::BucketStatistics* bucketStats = pbStats.mutable_bucketstatistics(); + if (hasCountValue) { + bucketStats->add_count(trueCount); + } + } + + void update(bool value, size_t repetitions) override { + if (value) { + trueCount += repetitions; + } + } + std::string toString() const override { std::ostringstream buffer; buffer << "Data type: Boolean" << std::endl - << "Values: " << valueCount << std::endl; - if(_hasCount){ + << "Values: " << valueCount << std::endl + << "Has null: " << (hasNullValue ? "yes" : "no") << std::endl; + if(hasCountValue){ buffer << "(true: " << trueCount << "; false: " << valueCount - trueCount << ")" << std::endl; } else { @@ -140,30 +210,29 @@ namespace orc { class DateColumnStatisticsImpl: public DateColumnStatistics { private: - bool _hasMinimum; - bool _hasMaximum; - uint64_t valueCount; + bool hasMinimumValue; + bool hasMaximumValue; int32_t minimum; int32_t maximum; public: - DateColumnStatisticsImpl(const proto::ColumnStatistics& stats, bool correctStats); + DateColumnStatisticsImpl() { + reset(); + } + DateColumnStatisticsImpl(const proto::ColumnStatistics& stats, + bool correctStats); virtual ~DateColumnStatisticsImpl(); bool hasMinimum() const override { - return _hasMinimum; + return hasMinimumValue; } bool hasMaximum() const override { - return _hasMaximum; - } - - uint64_t getNumberOfValues() const override { - return valueCount; + return hasMaximumValue; } int32_t getMinimum() const override { - if(_hasMinimum){ + if(hasMinimumValue){ return minimum; }else{ throw ParseError("Minimum is not defined."); @@ -171,24 +240,86 @@ namespace orc { } int32_t getMaximum() const override { - if(_hasMaximum){ + if(hasMaximumValue){ return maximum; }else{ throw 
ParseError("Maximum is not defined."); } } + void setMinimum(int32_t min) override { + this->minimum = min; + this->hasMinimumValue = true; + } + + void setMaximum(int32_t max) override { + this->maximum = max; + this->hasMaximumValue = true; + } + + void reset() override { + ColumnStatistics::reset(); + hasMinimumValue = false; + hasMaximumValue = false; + minimum = std::numeric_limits::min(); + maximum = std::numeric_limits::max(); + } + + void merge(const ColumnStatistics& other) override { + ColumnStatistics::merge(other); + + const DateColumnStatistics& dateStats = + dynamic_cast(other); + + if (dateStats.hasMinimum()) { + if (!hasMinimumValue) { + hasMinimumValue = hasMaximumValue = true; + minimum = dateStats.getMinimum(); + maximum = dateStats.getMaximum(); + } else { + if (dateStats.getMaximum() > maximum) { + maximum = dateStats.getMaximum(); + } + if (dateStats.getMinimum() < minimum) { + minimum = dateStats.getMinimum(); + } + } + } + } + + void toProtoBuf(proto::ColumnStatistics& pbStats) const override { + ColumnStatistics::toProtoBuf(pbStats); + + if (hasMinimumValue) { + proto::DateStatistics* dateStatistics = pbStats.mutable_datestatistics(); + dateStatistics->set_maximum(maximum); + dateStatistics->set_minimum(minimum); + } + } + + void update(int32_t value) override { + if (!hasMinimumValue) { + maximum = minimum = value; + hasMaximumValue = hasMinimumValue = true; + } else if (value < minimum) { + minimum = value; + } else if (value > maximum) { + maximum = value; + } + } + std::string toString() const override { std::ostringstream buffer; buffer << "Data type: Date" << std::endl - << "Values: " << valueCount << std::endl; - if(_hasMinimum){ + << "Values: " << valueCount << std::endl + << "Has null: " << (hasNullValue ? 
"yes" : "no") << std::endl; + if(hasMinimumValue){ buffer << "Minimum: " << minimum << std::endl; }else{ buffer << "Minimum: not defined" << std::endl; } - if(_hasMaximum){ + if(hasMaximumValue){ buffer << "Maximum: " << maximum << std::endl; }else{ buffer << "Maximum: not defined" << std::endl; @@ -199,116 +330,246 @@ namespace orc { class DecimalColumnStatisticsImpl: public DecimalColumnStatistics { private: - bool _hasMinimum; - bool _hasMaximum; - bool _hasSum; - uint64_t valueCount; - std::string minimum; - std::string maximum; - std::string sum; + bool hasMinimumValue; + bool hasMaximumValue; + bool hasSumValue; + Decimal minimum; + Decimal maximum; + Decimal sum; public: - DecimalColumnStatisticsImpl(const proto::ColumnStatistics& stats, bool correctStats); + DecimalColumnStatisticsImpl(): minimum(0, 0), + maximum(0, 0), + sum(0, 0) { + reset(); + } + DecimalColumnStatisticsImpl(const proto::ColumnStatistics& stats, + bool correctStats); virtual ~DecimalColumnStatisticsImpl(); bool hasMinimum() const override { - return _hasMinimum; + return hasMinimumValue; } bool hasMaximum() const override { - return _hasMaximum; + return hasMaximumValue; } bool hasSum() const override { - return _hasSum; - } - - uint64_t getNumberOfValues() const override { - return valueCount; + return hasSumValue; } Decimal getMinimum() const override { - if(_hasMinimum){ - return Decimal(minimum); + if(hasMinimumValue){ + return minimum; }else{ throw ParseError("Minimum is not defined."); } } Decimal getMaximum() const override { - if(_hasMaximum){ - return Decimal(maximum); + if(hasMaximumValue){ + return maximum; }else{ throw ParseError("Maximum is not defined."); } } Decimal getSum() const override { - if(_hasSum){ - return Decimal(sum); + if(hasSumValue){ + return sum; }else{ throw ParseError("Sum is not defined."); } } + void setMinimum(Decimal min) override { + this->hasMinimumValue = true; + minimum = min; + } + + void setMaximum(Decimal max) override { + this->hasMaximumValue 
= true; + maximum = max; + } + + void setSum(Decimal newSum) override { + this->hasSumValue = true; + sum = newSum; + } + + void setHasSum(bool hasSum) override { + this->hasSumValue = hasSum; + } + + void reset() override { + ColumnStatistics::reset(); + hasMinimumValue = false; + hasMaximumValue = false; + hasSumValue = true; + maximum = Decimal(0, 0); + minimum = Decimal(0, 0); + sum = Decimal(0, 0); + } + + void merge(const ColumnStatistics& other) override { + ColumnStatistics::merge(other); + + const DecimalColumnStatistics& decStats = + dynamic_cast(other); + + if (decStats.hasMinimum()) { + if (!hasMinimumValue) { + hasMinimumValue = hasMaximumValue = true; + minimum = decStats.getMinimum(); + maximum = decStats.getMaximum(); + } else { + if (decimalCompare(maximum.value, + maximum.scale, + decStats.getMaximum().value, + decStats.getMaximum().scale) < 0) { + maximum = decStats.getMaximum(); + } + if (decimalCompare(minimum.value, + minimum.scale, + decStats.getMinimum().value, + decStats.getMinimum().scale) > 0) { + minimum = decStats.getMinimum(); + } + } + } + + // hasSumValue here means no overflow + hasSumValue &= decStats.hasSum(); + if (hasSumValue) { + updateSum(decStats.getSum().value, decStats.getSum().scale); + } + } + + void toProtoBuf(proto::ColumnStatistics& pbStats) const override { + ColumnStatistics::toProtoBuf(pbStats); + + proto::DecimalStatistics* decStats = pbStats.mutable_decimalstatistics(); + if (hasMinimumValue) { + decStats->set_minimum(minimum.toString()); + decStats->set_maximum(maximum.toString()); + } + if (hasSumValue) { + decStats->set_sum(sum.toString()); + } + } + + void update(const Decimal& decimal) override { + update(decimal.value, decimal.scale); + } + + void update(int64_t value, int32_t scale) override { + update(Int128(value), scale); + } + + void update(Int128 value, int32_t scale) override { + if (!hasMinimumValue) { + hasMinimumValue = hasMaximumValue = true; + minimum = maximum = Decimal(value, scale); + } else { 
+ if (decimalCompare(value, + scale, + minimum.value, + minimum.scale) < 0) { + minimum = Decimal(value, scale); + } else if (decimalCompare(maximum.value, + maximum.scale, + value, + scale) < 0) { + maximum = Decimal(value, scale); + } + } + + if (hasSumValue) { + updateSum(value, scale); + } + } + std::string toString() const override { std::ostringstream buffer; buffer << "Data type: Decimal" << std::endl - << "Values: " << valueCount << std::endl; - if(_hasMinimum){ - buffer << "Minimum: " << minimum << std::endl; + << "Values: " << valueCount << std::endl + << "Has null: " << (hasNullValue ? "yes" : "no") << std::endl; + if(hasMinimumValue){ + buffer << "Minimum: " << minimum.toString() << std::endl; }else{ buffer << "Minimum: not defined" << std::endl; } - if(_hasMaximum){ - buffer << "Maximum: " << maximum << std::endl; + if(hasMaximumValue){ + buffer << "Maximum: " << maximum.toString() << std::endl; }else{ buffer << "Maximum: not defined" << std::endl; } - if(_hasSum){ - buffer << "Sum: " << sum << std::endl; + if(hasSumValue){ + buffer << "Sum: " << sum.toString() << std::endl; }else{ buffer << "Sum: not defined" << std::endl; } return buffer.str(); } + + private: + void updateSum(Int128 value, int32_t scale) { + if (hasSumValue) { + bool overflow = false; + if (sum.scale > scale) { + value = scaleInt128(value, sum.scale - scale, overflow); + } else if (sum.scale < scale) { + sum.value = scaleInt128(sum.value, scale - sum.scale, overflow); + sum.scale = scale; + } + + if (!overflow) { + bool wasPositive = sum.value >= 0; + sum.value += value; + if ((value >= 0) == wasPositive) { + hasSumValue = (sum.value >= 0) == wasPositive; + } + } else { + hasSumValue = false; + } + } + } }; class DoubleColumnStatisticsImpl: public DoubleColumnStatistics { private: - bool _hasMinimum; - bool _hasMaximum; - bool _hasSum; - uint64_t valueCount; + bool hasMinimumValue; + bool hasMaximumValue; + bool hasSumValue; double minimum; double maximum; double sum; public: + 
DoubleColumnStatisticsImpl() { + reset(); + } DoubleColumnStatisticsImpl(const proto::ColumnStatistics& stats); virtual ~DoubleColumnStatisticsImpl(); bool hasMinimum() const override { - return _hasMinimum; + return hasMinimumValue; } bool hasMaximum() const override { - return _hasMaximum; + return hasMaximumValue; } bool hasSum() const override { - return _hasSum; - } - - uint64_t getNumberOfValues() const override { - return valueCount; + return hasSumValue; } double getMinimum() const override { - if(_hasMinimum){ + if(hasMinimumValue){ return minimum; }else{ throw ParseError("Minimum is not defined."); @@ -316,7 +577,7 @@ namespace orc { } double getMaximum() const override { - if(_hasMaximum){ + if(hasMaximumValue){ return maximum; }else{ throw ParseError("Maximum is not defined."); @@ -324,30 +585,109 @@ namespace orc { } double getSum() const override { - if(_hasSum){ + if(hasSumValue){ return sum; }else{ throw ParseError("Sum is not defined."); } } + void setMinimum(double min) override { + this->minimum = min; + this->hasMinimumValue = true; + } + + void setMaximum(double max) override { + this->maximum = max; + this->hasMaximumValue = true; + } + + void setSum(double newSum) override { + this->sum = newSum; + this->hasSumValue = true; + } + + void setHasSum(bool hasSum) override { + this->hasSumValue = hasSum; + } + + void reset() override { + ColumnStatistics::reset(); + hasMinimumValue = false; + hasMaximumValue = false; + minimum = std::numeric_limits::min(); + maximum = std::numeric_limits::max(); + hasSumValue = true; + sum = 0.0; + } + + void merge(const ColumnStatistics& other) override { + ColumnStatistics::merge(other); + + const DoubleColumnStatistics& doubleColStats = + dynamic_cast(other); + + if (doubleColStats.hasMinimum()) { + if (!hasMinimumValue) { + hasMinimumValue = hasMaximumValue = true; + minimum = doubleColStats.getMinimum(); + maximum = doubleColStats.getMaximum(); + } else { + if (doubleColStats.getMaximum() > maximum) { + 
maximum = doubleColStats.getMaximum(); + } + if (doubleColStats.getMinimum() < minimum) { + minimum = doubleColStats.getMinimum(); + } + } + } + + sum += doubleColStats.getSum(); + } + + void update(double value) override { + if (!hasMinimumValue) { + maximum = minimum = value; + hasMaximumValue = hasMinimumValue = true; + } else if (value < minimum) { + minimum = value; + } else if (value > maximum) { + maximum = value; + } + sum += value; + } + + void toProtoBuf(proto::ColumnStatistics& pbStats) const override { + ColumnStatistics::toProtoBuf(pbStats); + + proto::DoubleStatistics* doubleStats = pbStats.mutable_doublestatistics(); + if (hasMinimumValue) { + doubleStats->set_minimum(minimum); + doubleStats->set_maximum(maximum); + } + if (hasSumValue) { + doubleStats->set_sum(sum); + } + } + std::string toString() const override { std::ostringstream buffer; buffer << "Data type: Double" << std::endl - << "Values: " << valueCount << std::endl; - if(_hasMinimum){ + << "Values: " << valueCount << std::endl + << "Has null: " << (hasNullValue ? 
"yes" : "no") << std::endl; + if(hasMinimumValue){ buffer << "Minimum: " << minimum << std::endl; }else{ buffer << "Minimum: not defined" << std::endl; } - if(_hasMaximum){ + if(hasMaximumValue){ buffer << "Maximum: " << maximum << std::endl; }else{ buffer << "Maximum: not defined" << std::endl; } - if(_hasSum){ + if(hasSumValue){ buffer << "Sum: " << sum << std::endl; }else{ buffer << "Sum: not defined" << std::endl; @@ -358,36 +698,34 @@ namespace orc { class IntegerColumnStatisticsImpl: public IntegerColumnStatistics { private: - bool _hasMinimum; - bool _hasMaximum; - bool _hasSum; - uint64_t valueCount; + bool hasMinimumValue; + bool hasMaximumValue; + bool hasSumValue; int64_t minimum; int64_t maximum; int64_t sum; public: + IntegerColumnStatisticsImpl() { + reset(); + } IntegerColumnStatisticsImpl(const proto::ColumnStatistics& stats); virtual ~IntegerColumnStatisticsImpl(); bool hasMinimum() const override { - return _hasMinimum; + return hasMinimumValue; } bool hasMaximum() const override { - return _hasMaximum; + return hasMaximumValue; } bool hasSum() const override { - return _hasSum; - } - - uint64_t getNumberOfValues() const override { - return valueCount; + return hasSumValue; } int64_t getMinimum() const override { - if(_hasMinimum){ + if(hasMinimumValue){ return minimum; }else{ throw ParseError("Minimum is not defined."); @@ -395,7 +733,7 @@ namespace orc { } int64_t getMaximum() const override { - if(_hasMaximum){ + if(hasMaximumValue){ return maximum; }else{ throw ParseError("Maximum is not defined."); @@ -403,30 +741,124 @@ namespace orc { } int64_t getSum() const override { - if(_hasSum){ + if(hasSumValue){ return sum; }else{ throw ParseError("Sum is not defined."); } } + void setMinimum(int64_t min) override { + this->minimum = min; + this->hasMinimumValue = true; + } + + void setMaximum(int64_t max) override { + this->maximum = max; + this->hasMaximumValue = true; + } + + void setSum(int64_t newSum) override { + this->sum = newSum; + 
this->hasSumValue = true; + } + + void setHasSum(bool hasSum) override { + this->hasSumValue = hasSum; + } + + void reset() override { + ColumnStatistics::reset(); + hasMinimumValue = false; + hasMaximumValue = false; + minimum = std::numeric_limits::min(); + maximum = std::numeric_limits::max(); + hasSumValue = true; + sum = 0; + } + + void merge(const ColumnStatistics& other) override { + ColumnStatistics::merge(other); + + const IntegerColumnStatistics& intColStats = + dynamic_cast(other); + + if (intColStats.hasMinimum()) { + if (!hasMinimumValue) { + hasMinimumValue = hasMaximumValue = true; + minimum = intColStats.getMinimum(); + maximum = intColStats.getMaximum(); + } else { + if (intColStats.getMaximum() > maximum) { + maximum = intColStats.getMaximum(); + } + if (intColStats.getMinimum() < minimum) { + minimum = intColStats.getMinimum(); + } + } + } + + // hasSumValue here means no overflow + hasSumValue &= intColStats.hasSum(); + if (hasSumValue) { + bool wasPositive = sum >= 0; + sum += intColStats.getSum(); + if ((intColStats.getSum() >= 0) == wasPositive) { + hasSumValue = (sum >= 0) == wasPositive; + } + } + } + + void update(int64_t value, int repetitions) override { + if (!hasMinimumValue) { + maximum = minimum = value; + hasMaximumValue = hasMinimumValue = true; + } else if (value < minimum) { + minimum = value; + } else if (value > maximum) { + maximum = value; + } + if (hasSumValue) { + bool wasPositive = sum >= 0; + sum += value * repetitions; + if ((value >= 0) == wasPositive) { + hasSumValue = (sum >= 0) == wasPositive; + } + } + } + + void toProtoBuf(proto::ColumnStatistics& pbStats) const override { + ColumnStatistics::toProtoBuf(pbStats); + + proto::IntegerStatistics* intStats = pbStats.mutable_intstatistics(); + if (hasMinimumValue) { + intStats->set_minimum(minimum); + intStats->set_maximum(maximum); + } + if (hasSumValue) { + intStats->set_sum(sum); + } + } + std::string toString() const override { std::ostringstream buffer; buffer << 
"Data type: Integer" << std::endl - << "Values: " << valueCount << std::endl; - if(_hasMinimum){ + << "Values: " << valueCount << std::endl + << "Has null: " << (hasNullValue ? "yes" : "no") << std::endl; + + if(hasMinimumValue){ buffer << "Minimum: " << minimum << std::endl; }else{ buffer << "Minimum: not defined" << std::endl; } - if(_hasMaximum){ + if(hasMaximumValue){ buffer << "Maximum: " << maximum << std::endl; }else{ buffer << "Maximum: not defined" << std::endl; } - if(_hasSum){ + if(hasSumValue){ buffer << "Sum: " << sum << std::endl; }else{ buffer << "Sum: not defined" << std::endl; @@ -435,38 +867,39 @@ namespace orc { } }; + class StringColumnStatisticsImpl: public StringColumnStatistics { private: - bool _hasMinimum; - bool _hasMaximum; - bool _hasTotalLength; - uint64_t valueCount; + bool hasMinimumValue; + bool hasMaximumValue; + bool hasTotalLen; std::string minimum; std::string maximum; uint64_t totalLength; public: - StringColumnStatisticsImpl(const proto::ColumnStatistics& stats, bool correctStats); + StringColumnStatisticsImpl(bool enableStrComparision) { + enableStringComparison = enableStrComparision; + reset(); + } + StringColumnStatisticsImpl(const proto::ColumnStatistics& stats, + bool correctStats); virtual ~StringColumnStatisticsImpl(); bool hasMinimum() const override { - return _hasMinimum; + return hasMinimumValue; } bool hasMaximum() const override { - return _hasMaximum; + return hasMaximumValue; } bool hasTotalLength() const override { - return _hasTotalLength; - } - - uint64_t getNumberOfValues() const override { - return valueCount; + return hasTotalLen; } std::string getMinimum() const override { - if(_hasMinimum){ + if(hasMinimumValue){ return minimum; }else{ throw ParseError("Minimum is not defined."); @@ -474,7 +907,7 @@ namespace orc { } std::string getMaximum() const override { - if(_hasMaximum){ + if(hasMaximumValue){ return maximum; }else{ throw ParseError("Maximum is not defined."); @@ -482,65 +915,178 @@ namespace orc { 
} uint64_t getTotalLength() const override { - if(_hasTotalLength){ + if(hasTotalLen){ return totalLength; }else{ throw ParseError("Total length is not defined."); } } + void setMinimum(std::string min) override { + this->minimum = min; + this->hasMinimumValue = true; + } + + void setMaximum(std::string max) override { + this->maximum = max; + this->hasMaximumValue = true; + } + + void setTotalLength(uint64_t newTotalLength) override { + this->totalLength = newTotalLength; + this->hasTotalLen = true; + } + + void setHasTotalLength(bool newHasTotalLength) override { + this->hasTotalLen = newHasTotalLength; + } + + void reset() override { + ColumnStatistics::reset(); + hasMinimumValue = false; + hasMaximumValue = false; + minimum = std::string(); + maximum = std::string(); + hasTotalLen = true; + totalLength = 0; + } + + void merge(const ColumnStatistics& other) override { + ColumnStatistics::merge(other); + + const StringColumnStatistics& strColStats = + dynamic_cast(other); + + if (enableStringComparison) { + if (strColStats.hasMinimum()) { + if (!hasMinimumValue) { + hasMinimumValue = hasMaximumValue = true; + minimum = strColStats.getMinimum(); + maximum = strColStats.getMaximum(); + } else { + if (strColStats.getMaximum() > maximum) { + maximum = strColStats.getMaximum(); + } + if (strColStats.getMinimum() < minimum) { + minimum = strColStats.getMinimum(); + } + } + } + } + + totalLength += strColStats.getTotalLength(); + } + + void toProtoBuf(proto::ColumnStatistics& pbStats) const override { + ColumnStatistics::toProtoBuf(pbStats); + + proto::StringStatistics* strStats = pbStats.mutable_stringstatistics(); + if (hasMinimumValue) { + strStats->set_minimum(minimum); + strStats->set_maximum(maximum); + } + + strStats->set_sum(static_cast(totalLength)); + } + + void update(const std::string& value) override { + if (enableStringComparison) { + if (!hasMinimumValue) { + maximum = minimum = value; + hasMaximumValue = hasMinimumValue = true; + } else if (value < 
minimum) { + minimum = value; + } else if (value > maximum) { + maximum = value; + } + } + + totalLength += value.length(); + } + + void update(const char* value, size_t length) override { + if (enableStringComparison && value != nullptr) { + if (!hasMinimumValue) { + maximum = minimum = std::string(value, value + length); + hasMaximumValue = hasMinimumValue = true; + } else { + // update min: value wins if it sorts first over the common prefix, or ties there and is shorter + int minCmp = strncmp(minimum.c_str(), + value, + std::min(minimum.length(), length)); + if (minCmp > 0 || (minCmp == 0 && length < minimum.length())) { + minimum = std::string(value, value + length); + } + + // update max: on a tie over the common prefix the longer string is larger, so test against maximum's own length + int maxCmp = strncmp(maximum.c_str(), + value, + std::min(maximum.length(), length)); + if (maxCmp < 0 || (maxCmp == 0 && length > maximum.length())) { + maximum = std::string(value, value + length); + } + } + } + + totalLength += length; + } + std::string toString() const override { std::ostringstream buffer; buffer << "Data type: String" << std::endl - << "Values: " << valueCount << std::endl; - if(_hasMinimum){ + << "Values: " << valueCount << std::endl + << "Has null: " << (hasNullValue ? 
"yes" : "no") << std::endl; + if(hasMinimumValue){ buffer << "Minimum: " << minimum << std::endl; }else{ buffer << "Minimum is not defined" << std::endl; } - if(_hasMaximum){ + if(hasMaximumValue){ buffer << "Maximum: " << maximum << std::endl; }else{ buffer << "Maximum is not defined" << std::endl; } - if(_hasTotalLength){ + if(hasTotalLen){ buffer << "Total length: " << totalLength << std::endl; }else{ buffer << "Total length is not defined" << std::endl; } return buffer.str(); } + + private: + // a flag to enable string comparison for min/max; the comparison is very + // time-consuming, so it can be turned off + bool enableStringComparison; }; class TimestampColumnStatisticsImpl: public TimestampColumnStatistics { private: - bool _hasMinimum; - bool _hasMaximum; - uint64_t valueCount; + bool hasMinimumValue; + bool hasMaximumValue; int64_t minimum; int64_t maximum; public: + TimestampColumnStatisticsImpl() { + reset(); + } TimestampColumnStatisticsImpl(const proto::ColumnStatistics& stats, bool correctStats); virtual ~TimestampColumnStatisticsImpl(); bool hasMinimum() const override { - return _hasMinimum; + return hasMinimumValue; } bool hasMaximum() const override { - return _hasMaximum; - } - - uint64_t getNumberOfValues() const override { - return valueCount; + return hasMaximumValue; } int64_t getMinimum() const override { - if(_hasMinimum){ + if(hasMinimumValue){ return minimum; }else{ throw ParseError("Minimum is not defined."); @@ -548,24 +1094,89 @@ namespace orc { } int64_t getMaximum() const override { - if(_hasMaximum){ + if(hasMaximumValue){ return maximum; }else{ throw ParseError("Maximum is not defined."); } } + void setMinimum(int64_t min) override { + this->minimum = min; + this->hasMinimumValue = true; + } + + void setMaximum(int64_t max) override { + this->maximum = max; + this->hasMaximumValue = true; + } + + void reset() override { + ColumnStatistics::reset(); + hasMinimumValue = false; + hasMaximumValue = false; + minimum = 0; + maximum = 0; + } + + 
void merge(const ColumnStatistics& other) override { + ColumnStatistics::merge(other); + + const TimestampColumnStatistics& tsStats = + dynamic_cast(other); + + if (tsStats.hasMinimum()) { + if (!hasMinimumValue) { + hasMinimumValue = hasMaximumValue = true; + minimum = tsStats.getMinimum(); + maximum = tsStats.getMaximum(); + } else { + if (tsStats.getMaximum() > maximum) { + maximum = tsStats.getMaximum(); + } + if (tsStats.getMinimum() < minimum) { + minimum = tsStats.getMinimum(); + } + } + } + } + + void toProtoBuf(proto::ColumnStatistics& pbStats) const override { + ColumnStatistics::toProtoBuf(pbStats); + + if (hasMinimumValue) { + proto::TimestampStatistics* timestampStatistics = + pbStats.mutable_timestampstatistics(); + + // ORC-135: min and max are deprecated, store UTC instead + timestampStatistics->set_maximumutc(maximum); + timestampStatistics->set_minimumutc(minimum); + } + } + + void update(int64_t value) override { + if (!hasMinimumValue) { + maximum = minimum = value; + hasMaximumValue = hasMinimumValue = true; + } else if (value < minimum) { + minimum = value; + } else if (value > maximum) { + maximum = value; + } + } + std::string toString() const override { std::ostringstream buffer; buffer << "Data type: Timestamp" << std::endl - << "Values: " << valueCount << std::endl; - if(_hasMinimum){ + << "Values: " << valueCount << std::endl + << "Has null: " << (hasNullValue ? 
"yes" : "no") << std::endl; + if(hasMinimumValue){ buffer << "Minimum: " << minimum << std::endl; }else{ buffer << "Minimum is not defined" << std::endl; } - if(_hasMaximum){ + if(hasMaximumValue){ buffer << "Maximum: " << maximum << std::endl; }else{ buffer << "Maximum is not defined" << std::endl; @@ -574,7 +1185,6 @@ namespace orc { } }; - ColumnStatistics* convertColumnStatistics(const proto::ColumnStatistics& s, bool correctStats); diff --git a/c++/src/Stream.cc b/c++/src/Stream.cc new file mode 100644 index 0000000000..9e4d0565c5 --- /dev/null +++ b/c++/src/Stream.cc @@ -0,0 +1,222 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "Exceptions.hh" +#include "Stream.hh" + +#include +#include + +namespace orc { + + void printBuffer(std::ostream& out, + const char *buffer, + uint64_t length) { + const uint64_t width = 24; + out << std::hex; + for(uint64_t line = 0; line < (length + width - 1) / width; ++line) { + out << std::setfill('0') << std::setw(7) << (line * width); + for(uint64_t byte = 0; + byte < width && line * width + byte < length; ++byte) { + out << " " << std::setfill('0') << std::setw(2) + << static_cast(0xff & buffer[line * width + + byte]); + } + out << "\n"; + } + out << std::dec; + } + + PositionProvider::PositionProvider(const std::list& posns) { + position = posns.begin(); + } + + uint64_t PositionProvider::next() { + uint64_t result = *position; + ++position; + return result; + } + + SeekableInputStream::~SeekableInputStream() { + // PASS + } + + SeekableArrayInputStream::~SeekableArrayInputStream() { + // PASS + } + + SeekableArrayInputStream::SeekableArrayInputStream + (const unsigned char* values, + uint64_t size, + uint64_t blkSize + ): data(reinterpret_cast(values)) { + length = size; + position = 0; + blockSize = blkSize == 0 ? length : static_cast(blkSize); + } + + SeekableArrayInputStream::SeekableArrayInputStream(const char* values, + uint64_t size, + uint64_t blkSize + ): data(values) { + length = size; + position = 0; + blockSize = blkSize == 0 ? 
length : static_cast(blkSize); + } + + bool SeekableArrayInputStream::Next(const void** buffer, int*size) { + uint64_t currentSize = std::min(length - position, blockSize); + if (currentSize > 0) { + *buffer = data + position; + *size = static_cast(currentSize); + position += currentSize; + return true; + } + *size = 0; + return false; + } + + void SeekableArrayInputStream::BackUp(int count) { + if (count >= 0) { + uint64_t unsignedCount = static_cast(count); + if (unsignedCount <= blockSize && unsignedCount <= position) { + position -= unsignedCount; + } else { + throw std::logic_error("Can't backup that much!"); + } + } + } + + bool SeekableArrayInputStream::Skip(int count) { + if (count >= 0) { + uint64_t unsignedCount = static_cast(count); + if (unsignedCount + position <= length) { + position += unsignedCount; + return true; + } else { + position = length; + } + } + return false; + } + + google::protobuf::int64 SeekableArrayInputStream::ByteCount() const { + return static_cast(position); + } + + void SeekableArrayInputStream::seek(PositionProvider& seekPosition) { + position = seekPosition.next(); + } + + std::string SeekableArrayInputStream::getName() const { + std::ostringstream result; + result << "SeekableArrayInputStream " << position << " of " << length; + return result.str(); + } + + static uint64_t computeBlock(uint64_t request, uint64_t length) { + return std::min(length, request == 0 ? 
256 * 1024 : request); + } + + SeekableFileInputStream::SeekableFileInputStream(InputStream* stream, + uint64_t offset, + uint64_t byteCount, + MemoryPool& _pool, + uint64_t _blockSize + ):pool(_pool), + input(stream), + start(offset), + length(byteCount), + blockSize(computeBlock + (_blockSize, + length)) { + + position = 0; + buffer.reset(new DataBuffer(pool)); + pushBack = 0; + } + + SeekableFileInputStream::~SeekableFileInputStream() { + // PASS + } + + bool SeekableFileInputStream::Next(const void** data, int*size) { + uint64_t bytesRead; + if (pushBack != 0) { + *data = buffer->data() + (buffer->size() - pushBack); + bytesRead = pushBack; + } else { + bytesRead = std::min(length - position, blockSize); + buffer->resize(bytesRead); + if (bytesRead > 0) { + input->read(buffer->data(), bytesRead, start+position); + *data = static_cast(buffer->data()); + } + } + position += bytesRead; + pushBack = 0; + *size = static_cast(bytesRead); + return bytesRead != 0; + } + + void SeekableFileInputStream::BackUp(int signedCount) { + if (signedCount < 0) { + throw std::logic_error("can't backup negative distances"); + } + uint64_t count = static_cast(signedCount); + if (pushBack > 0) { + throw std::logic_error("can't backup unless we just called Next"); + } + if (count > blockSize || count > position) { + throw std::logic_error("can't backup that far"); + } + pushBack = static_cast(count); + position -= pushBack; + } + + bool SeekableFileInputStream::Skip(int signedCount) { + if (signedCount < 0) { + return false; + } + uint64_t count = static_cast(signedCount); + position = std::min(position + count, length); + pushBack = 0; + return position < length; + } + + int64_t SeekableFileInputStream::ByteCount() const { + return static_cast(position); + } + + void SeekableFileInputStream::seek(PositionProvider& location) { + position = location.next(); + if (position > length) { + position = length; + throw std::logic_error("seek too far"); + } + pushBack = 0; + } + + std::string 
SeekableFileInputStream::getName() const { + std::ostringstream result; + result << input->getName() << " from " << start << " for " + << length; + return result.str(); + } + +} diff --git a/c++/src/Stream.hh b/c++/src/Stream.hh new file mode 100644 index 0000000000..368a2c298e --- /dev/null +++ b/c++/src/Stream.hh @@ -0,0 +1,116 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef ORC_STREAM_HH +#define ORC_STREAM_HH + +#include "Adaptor.hh" +#include "orc/OrcFile.hh" +#include "wrap/zero-copy-stream-wrapper.h" + +#include +#include +#include +#include +#include + +namespace orc { + + void printBuffer(std::ostream& out, + const char *buffer, + uint64_t length); + + class PositionProvider { + private: + std::list::const_iterator position; + public: + PositionProvider(const std::list& positions); + uint64_t next(); + }; + + /** + * A subclass of Google's ZeroCopyInputStream that supports seek. + * By extending Google's class, we get the ability to pass it directly + * to the protobuf readers. 
+ */ + class SeekableInputStream: public google::protobuf::io::ZeroCopyInputStream { + public: + virtual ~SeekableInputStream(); + virtual void seek(PositionProvider& position) = 0; + virtual std::string getName() const = 0; + }; + + /** + * Create a seekable input stream based on a memory range. + */ + class SeekableArrayInputStream: public SeekableInputStream { + private: + const char* data; + uint64_t length; + uint64_t position; + uint64_t blockSize; + + public: + SeekableArrayInputStream(const unsigned char* list, + uint64_t length, + uint64_t block_size = 0); + SeekableArrayInputStream(const char* list, + uint64_t length, + uint64_t block_size = 0); + virtual ~SeekableArrayInputStream(); + virtual bool Next(const void** data, int*size) override; + virtual void BackUp(int count) override; + virtual bool Skip(int count) override; + virtual google::protobuf::int64 ByteCount() const override; + virtual void seek(PositionProvider& position) override; + virtual std::string getName() const override; + }; + + /** + * Create a seekable input stream based on an input stream. 
+ */ + class SeekableFileInputStream: public SeekableInputStream { + private: + MemoryPool& pool; + InputStream* const input; + const uint64_t start; + const uint64_t length; + const uint64_t blockSize; + std::unique_ptr > buffer; + uint64_t position; + uint64_t pushBack; + + public: + SeekableFileInputStream(InputStream* input, + uint64_t offset, + uint64_t byteCount, + MemoryPool& pool, + uint64_t blockSize = 0); + virtual ~SeekableFileInputStream(); + + virtual bool Next(const void** data, int*size) override; + virtual void BackUp(int count) override; + virtual bool Skip(int count) override; + virtual int64_t ByteCount() const override; + virtual void seek(PositionProvider& position) override; + virtual std::string getName() const override; + }; + +} + +#endif //ORC_STREAM_HH diff --git a/c++/src/TypeImpl.cc b/c++/src/TypeImpl.cc index fdf66a0f33..99a25812c9 100644 --- a/c++/src/TypeImpl.cc +++ b/c++/src/TypeImpl.cc @@ -491,4 +491,190 @@ namespace orc { return std::unique_ptr(result); } + Type * Type::buildTypeFromString(const std::string& input) { + std::vector > res = + TypeImpl::buildTypeFromStringImpl(input, 0, input.size()); + if (res.size() != 1) { + throw std::logic_error("Invalid type string."); + } + return res[0].second; + } + + std::vector > + TypeImpl::buildTypeFromStringImpl(const std::string& input, + size_t start, + size_t end) { + std::string types = input.substr(start, end - start); + std::vector > res; + size_t pos = 0; + + while (pos < types.size()) { + size_t endPos = pos; + while (endPos < types.size() && + ((types[endPos] >= 'a' && types[endPos] <= 'z') || + (types[endPos] >= '0' && types[endPos] <= '9'))) { + ++endPos; + } + + std::string fieldName; + if (types[endPos] == ':') { + fieldName = types.substr(pos, endPos - pos); + pos = ++endPos; + while (endPos < types.size() && types[endPos] >= 'a' + && types[endPos] <= 'z') { + ++endPos; + } + } + + size_t nextPos = endPos + 1; + if (types[endPos] == '<') { + int count = 1; + while (nextPos 
< types.size()) { + if (types[nextPos] == '<') { + ++count; + } else if (types[nextPos] == '>') { + --count; + } + if (count == 0) { + break; + } + ++nextPos; + } + if (nextPos == types.size()) { + throw std::logic_error("Invalid type string. Cannot find closing >"); + } + } else if (types[endPos] == '(') { + while (nextPos < types.size() && types[nextPos] != ')') { + ++nextPos; + } + if (nextPos == types.size()) { + throw std::logic_error("Invalid type string. Cannot find closing )"); + } + } else if (types[endPos] != ',' && types[endPos] != '\0') { + throw std::logic_error("Unrecognize character."); + } + + std::string typeName = types.substr(pos, endPos - pos); + if (typeName == "boolean") { + res.push_back(std::make_pair(fieldName, new TypeImpl(BOOLEAN))); + } else if (typeName == "tinyint") { + res.push_back(std::make_pair(fieldName, new TypeImpl(BYTE))); + } else if (typeName == "smallint") { + res.push_back(std::make_pair(fieldName, new TypeImpl(SHORT))); + } else if (typeName == "int") { + res.push_back(std::make_pair(fieldName, new TypeImpl(INT))); + } else if (typeName == "bigint") { + res.push_back(std::make_pair(fieldName, new TypeImpl(LONG))); + } else if (typeName == "float") { + res.push_back(std::make_pair(fieldName, new TypeImpl(FLOAT))); + } else if (typeName == "double") { + res.push_back(std::make_pair(fieldName, new TypeImpl(DOUBLE))); + } else if (typeName == "string") { + res.push_back(std::make_pair(fieldName, new TypeImpl(STRING))); + } else if (typeName == "binary") { + res.push_back(std::make_pair(fieldName, new TypeImpl(BINARY))); + } else if (typeName == "timestamp") { + res.push_back(std::make_pair(fieldName, new TypeImpl(TIMESTAMP))); + } else if (typeName == "array") { + TypeImpl * arrayType = new TypeImpl(LIST); + std::vector > v = + TypeImpl::buildTypeFromStringImpl( + types, + endPos + 1, + nextPos); + if (v.size() != 1) { + throw std::logic_error( + "Array type must contain exactly one sub type."); + } + 
arrayType->addChildType(std::unique_ptr(v[0].second)); + res.push_back(std::make_pair(fieldName, arrayType)); + } else if (typeName == "map") { + TypeImpl * mapType = new TypeImpl(MAP); + std::vector > v = + TypeImpl::buildTypeFromStringImpl( + types, + endPos + 1, + nextPos); + if (v.size() != 2) { + throw std::logic_error( + "Map type must contain exactly two sub types."); + } + mapType->addChildType(std::unique_ptr(v[0].second)); + mapType->addChildType(std::unique_ptr(v[1].second)); + res.push_back(std::make_pair(fieldName, mapType)); + } else if (typeName == "struct") { + TypeImpl * structType = new TypeImpl(STRUCT); + std::vector > v = + TypeImpl::buildTypeFromStringImpl( + types, + endPos + 1, + nextPos); + if (v.size() == 0) { + throw std::logic_error( + "Struct type must contain at least one sub type."); + } + for (size_t i = 0; i < v.size(); ++i) { + structType->addStructField( + v[i].first, + std::unique_ptr(v[i].second)); + } + res.push_back(std::make_pair(fieldName, structType)); + } else if (typeName == "uniontype") { + TypeImpl * unionType = new TypeImpl(UNION); + std::vector > v = + TypeImpl::buildTypeFromStringImpl( + types, + endPos + 1, + nextPos); + if (v.size() == 0) { + throw std::logic_error( + "Union type must contain at least one sub type."); + } + for (size_t i = 0; i < v.size(); ++i) { + unionType->addChildType(std::unique_ptr(v[i].second)); + } + res.push_back(std::make_pair(fieldName, unionType)); + } else if (typeName == "decimal") { + size_t sep = types.find(',', endPos + 1); + if (sep + 1 >= nextPos || sep == std::string::npos) { + throw std::logic_error( + "Decimal type must specify precision and scale."); + } + uint64_t precision = + static_cast( + atoi( + types.substr(endPos + 1, sep - endPos - 1).c_str())); + uint64_t scale = + static_cast( + atoi(types.substr(sep + 1, nextPos - sep - 1).c_str())); + TypeImpl * decimalType = new TypeImpl(DECIMAL, precision, scale); + res.push_back(std::make_pair(fieldName, decimalType)); + } else 
if (typeName == "date") { + res.push_back(std::make_pair(fieldName, new TypeImpl(DATE))); + } else if (typeName == "varchar") { + uint64_t maxLength = static_cast( + atoi( + types.substr(endPos + 1, nextPos - endPos - 1).c_str())); + res.push_back( + std::make_pair(fieldName, new TypeImpl(VARCHAR, maxLength))); + } else if (typeName == "char") { + uint64_t maxLength = static_cast( + atoi( + types.substr(endPos + 1, nextPos - endPos - 1).c_str())); + res.push_back(std::make_pair(fieldName, new TypeImpl(CHAR, maxLength))); + } + else { + throw std::logic_error("Unknown type " + typeName); + } + + if (types[nextPos] == ')' || types[nextPos] == '>') { + pos = nextPos + 2; + } else { + pos = nextPos; + } + } + + return res; + } + } diff --git a/c++/src/TypeImpl.hh b/c++/src/TypeImpl.hh index e2866e456d..7d7577fd07 100644 --- a/c++/src/TypeImpl.hh +++ b/c++/src/TypeImpl.hh @@ -98,6 +98,11 @@ namespace orc { */ void addChildType(std::unique_ptr childType); + static std::vector > buildTypeFromStringImpl( + const std::string& input, + size_t start, + size_t end); + private: /** * Assign ids to this node and its children giving this From 3dbc7722df4ce689e58139ea97cb7c20cdd0449d Mon Sep 17 00:00:00 2001 From: Gang Wu Date: Thu, 27 Apr 2017 20:55:05 -0700 Subject: [PATCH 2/4] Make the diff more concise --- c++/include/orc/Int128.hh | 109 +---- c++/include/orc/Statistics.hh | 284 +---------- c++/src/Int128.cc | 72 --- c++/src/Statistics.cc | 153 ++---- c++/src/Statistics.hh | 892 ++++++---------------------------- 5 files changed, 192 insertions(+), 1318 deletions(-) diff --git a/c++/include/orc/Int128.hh b/c++/include/orc/Int128.hh index 023f2a21d6..70793dcdd6 100644 --- a/c++/include/orc/Int128.hh +++ b/c++/include/orc/Int128.hh @@ -150,26 +150,6 @@ namespace orc { */ Int128 divide(const Int128 &right, Int128& remainder) const; - /** - * Divide this number by right and return the integral part. This operation - * is destructive. 
- * @param right the number to divide by - */ - Int128& operator/=(const Int128 &right) { - *this = operator/(right); - return *this; - } - - /** - * Divide this number by right and return the integral part. This operation - * is not destructive. - * @param right the number to divide by - */ - Int128 operator/(const Int128 &right) const { - Int128 remainder; - return divide(right, remainder); - } - /** * Logical or between two Int128. * @param right the number to or in @@ -181,20 +161,10 @@ namespace orc { return *this; } - /** - * Logical or between two Int128. - * @param right the number to or in - * @return result - */ - Int128 operator|(const Int128 right) { - Int128 value = *this; - value |= right; - return value; - } - /** * Logical and between two Int128. * @param right the number to and in + * @return *this */ Int128& operator&=(const Int128 &right) { lowbits &= right.lowbits; @@ -202,16 +172,6 @@ namespace orc { return *this; } - /** - * Logical and between two Int128. - * @param right the number to and in - */ - Int128 operator&(const Int128 &right) { - Int128 value = *this; - value &= right; - return value; - } - /** * Shift left by the given number of bits. * Values larger than 2**127 will shift into the sign bit. @@ -233,16 +193,6 @@ namespace orc { return *this; } - /** - * Shift left by the given number of bits. - * Values larger than 2**127 will shift into the sign bit. - */ - Int128 operator<<(uint32_t bits) { - Int128 value = *this; - value <<= bits; - return value; - } - /** * Shift right by the given number of bits. Negative values will * sign extend and fill with one bits. @@ -265,16 +215,6 @@ namespace orc { return *this; } - /** - * Shift right by the given number of bits. Negative values will - * sign extend and fill with one bits. 
- */ - Int128 operator>>(uint32_t bits) { - Int128 value = *this; - value >>= bits; - return value; - } - bool operator==(const Int128& right) const { return highbits == right.highbits && lowbits == right.lowbits; } @@ -392,52 +332,5 @@ namespace orc { int64_t highbits; uint64_t lowbits; }; - - /** - * Scales an Int128 value - * @param value the Int128 value to scale - * @param scaleMultiplier the scale offset. Result of a negative - * scaleMultiplier is undefined. - * @param overflow returns whether the result overflows or not - * @return the scaled value - */ - Int128 scaleInt128(Int128 value, - int32_t scaleMultiplier, - bool &overflow); - - /** - * Compares two decimals - * @param lValue the integer representation of left decimal - * @param lScale the scale of left decimal - * @param rValue the integer representation of right decimal - * @param rScale the scale of right decimal - * @return -1 if left decimal is smaller, or - * 1 if right decimal is smaller, or - * 0 if they are equal - */ - int32_t decimalCompare(Int128 lValue, int32_t lScale, - Int128 rValue, int32_t rScale); - - const int32_t MAX_PRECISION_64 = 18; - const int64_t POWERS_OF_TEN[MAX_PRECISION_64 + 1] = - {1, - 10, - 100, - 1000, - 10000, - 100000, - 1000000, - 10000000, - 100000000, - 1000000000, - 10000000000, - 100000000000, - 1000000000000, - 10000000000000, - 100000000000000, - 1000000000000000, - 10000000000000000, - 100000000000000000, - 1000000000000000000}; } #endif diff --git a/c++/include/orc/Statistics.hh b/c++/include/orc/Statistics.hh index 489838f252..c65eb3ce7d 100644 --- a/c++/include/orc/Statistics.hh +++ b/c++/include/orc/Statistics.hh @@ -37,76 +37,12 @@ namespace orc { * of rows because of NULL values and repeated values. 
* @return the number of values */ - virtual uint64_t getNumberOfValues() const { - return valueCount; - } - - /** - * Set the number of values in this column - * @param newValueCount new number of values to be set - */ - virtual void setNumberOfValues(uint64_t newValueCount) { - valueCount = newValueCount; - } - - /** - * Check whether column has null value - * @return true if has null value - */ - virtual bool hasNull() const { - return hasNullValue; - } - - /** - * Set whether column has null value - * @param newHasNull has null value - */ - virtual void setHasNull(bool newHasNull) { - hasNullValue = newHasNull; - } + virtual uint64_t getNumberOfValues() const = 0; /** * print out statistics of column if any */ virtual std::string toString() const = 0; - - /** - * Increases count of values - * @param count number of values to be increased - */ - virtual void increase(uint64_t count) { - valueCount += count; - } - - /** - * reset column statistics to initial state - */ - virtual void reset() { - hasNullValue = false; - valueCount = 0; - } - - /** - * Merges another statistics - * @param other statistics to be merged - */ - virtual void merge(const ColumnStatistics& other) { - hasNullValue |= other.hasNull(); - valueCount += other.getNumberOfValues(); - } - - /** - * Convert statistics to protobuf version - * @param pbStats output of protobuf stats - */ - virtual void toProtoBuf(proto::ColumnStatistics& pbStats) const { - pbStats.set_hasnull(hasNullValue); - pbStats.set_numberofvalues(valueCount); - } - - protected: - uint64_t valueCount; - bool hasNullValue; }; /** @@ -122,30 +58,7 @@ namespace orc { */ virtual bool hasTotalLength() const = 0; - /** - * set has total length - * @param newHasTotalLength has total length - */ - virtual void setHasTotalLength(bool newHasTotalLength) = 0; - - /** - * get total length - * @return total length - */ virtual uint64_t getTotalLength() const = 0; - - /** - * set total length - * @param newTotalLength new total length value 
- */ - virtual void setTotalLength(uint64_t newTotalLength) = 0; - - /** - * update stats by a new value - * @param value new value to update - * @param length length of the value - */ - virtual void update(const char* value, size_t length) = 0; }; /** @@ -161,23 +74,8 @@ namespace orc { */ virtual bool hasCount() const = 0; - /** - * set hasCount value - * @param hasCount new hasCount value - */ - virtual void setHasCount(bool hasCount) = 0; - virtual uint64_t getFalseCount() const = 0; virtual uint64_t getTrueCount() const = 0; - - virtual void setTrueCount(uint64_t trueCount) = 0; - - /** - * update stats by a new value - * @param value new value to update - * @param repetitions the repetitions of the boolean value - */ - virtual void update(bool value, size_t repetitions) = 0; }; /** @@ -205,29 +103,11 @@ namespace orc { */ virtual int32_t getMinimum() const = 0; - /** - * set new minimum value - * @param min new minimum value - */ - virtual void setMinimum(int32_t min) = 0; - /** * Get the maximum value for the column. * @return maximum value */ virtual int32_t getMaximum() const = 0; - - /** - * set new maximum value - * @param max new maximum value - */ - virtual void setMaximum(int32_t max) = 0; - - /** - * update stats by a new value - * @param value new value to update - */ - virtual void update(int32_t value) = 0; }; /** @@ -255,67 +135,23 @@ namespace orc { */ virtual bool hasSum() const = 0; - /** - * set hasSum value - * @param newHasSum hasSum value - */ - virtual void setHasSum(bool newHasSum) = 0; - /** * Get the minimum value for the column. * @return minimum value */ virtual Decimal getMinimum() const = 0; - /** - * set new minimum value - * @param min new minimum value - */ - virtual void setMinimum(Decimal min) = 0; - /** * Get the maximum value for the column. 
* @return maximum value */ virtual Decimal getMaximum() const = 0; - /** - * set new maximum value - * @param max new maximum value - */ - virtual void setMaximum(Decimal max) = 0; - /** * Get the sum for the column. * @return sum of all the values */ virtual Decimal getSum() const = 0; - - /** - * set new sum - * @param newSum sum to be set - */ - virtual void setSum(Decimal newSum) = 0; - - /** - * update stats by a new value - * @param value new value to update - */ - virtual void update(const Decimal& value) = 0; - - /** - * update stats by a new value - * @param value new decimal value represented in Int128 - * @param scale scale of the decimal - */ - virtual void update(Int128 value, int32_t scale) = 0; - - /** - * update stats by a new value - * @param value new decimal value represented in int64_t - * @param scale scale of the decimal - */ - virtual void update(int64_t value, int32_t scale) = 0; }; /** @@ -343,12 +179,6 @@ namespace orc { */ virtual bool hasSum() const = 0; - /** - * set hasSum value - * @param newHasSum hasSum value - */ - virtual void setHasSum(bool newHasSum) = 0; - /** * Get the smallest value in the column. Only defined if getNumberOfValues * is non-zero. @@ -356,12 +186,6 @@ namespace orc { */ virtual double getMinimum() const = 0; - /** - * set new minimum value - * @param min new minimum value - */ - virtual void setMinimum(double min) = 0; - /** * Get the largest value in the column. Only defined if getNumberOfValues * is non-zero. @@ -369,29 +193,11 @@ namespace orc { */ virtual double getMaximum() const = 0; - /** - * set new maximum value - * @param max new maximum value - */ - virtual void setMaximum(double max) = 0; - /** * Get the sum of the values in the column. 
* @return the sum */ virtual double getSum() const = 0; - - /** - * set new sum - * @param newSum sum to be set - */ - virtual void setSum(double newSum) = 0; - - /** - * update stats by a new value - * @param value new value to update - */ - virtual void update(double value) = 0; }; /** @@ -420,12 +226,6 @@ namespace orc { */ virtual bool hasSum() const = 0; - /** - * set hasSum value - * @param newHasSum hasSum value - */ - virtual void setHasSum(bool newHasSum) = 0; - /** * Get the smallest value in the column. Only defined if getNumberOfValues * is non-zero. @@ -433,12 +233,6 @@ namespace orc { */ virtual int64_t getMinimum() const = 0; - /** - * set new minimum value - * @param min new minimum value - */ - virtual void setMinimum(int64_t min) = 0; - /** * Get the largest value in the column. Only defined if getNumberOfValues * is non-zero. @@ -446,30 +240,11 @@ namespace orc { */ virtual int64_t getMaximum() const = 0; - /** - * set new maximum value - * @param max new maximum value - */ - virtual void setMaximum(int64_t max) = 0; - /** * Get the sum of the column. Only valid if isSumDefined returns true. * @return the sum of the column */ virtual int64_t getSum() const = 0; - - /** - * set new sum - * @param newSum sum to be set - */ - virtual void setSum(int64_t newSum) = 0; - - /** - * update stats by a new value - * @param value new value to update - * @param repetitions repetition of the value - */ - virtual void update(int64_t value, int repetitions) = 0; }; /** @@ -497,60 +272,23 @@ namespace orc { */ virtual bool hasTotalLength() const = 0; - /** - * set has total length - * @param newHasTotalLength has total length - */ - virtual void setHasTotalLength(bool newHasTotalLength) = 0; - /** * Get the minimum value for the column. 
* @return minimum value */ virtual std::string getMinimum() const = 0; - /** - * set new minimum value - * @param min new minimum value - */ - virtual void setMinimum(std::string min) = 0; - /** * Get the maximum value for the column. * @return maximum value */ virtual std::string getMaximum() const = 0; - /** - * set new maximum value - * @param max new maximum value - */ - virtual void setMaximum(std::string max) = 0; - /** * Get the total length of all values. * @return total length of all the values */ virtual uint64_t getTotalLength() const = 0; - - /** - * set total length - * @param newTotalLength new total length value - */ - virtual void setTotalLength(uint64_t newTotalLength) = 0; - - /** - * update stats by a new value - * @param value new value to update - */ - virtual void update(const std::string& value) = 0; - - /** - * update stats by a new value - * @param value new value to update - * @param length length of the value - */ - virtual void update(const char* value, size_t length) = 0; }; /** @@ -578,29 +316,11 @@ namespace orc { */ virtual int64_t getMinimum() const = 0; - /** - * set new minimum value - * @param min new minimum value - */ - virtual void setMinimum(int64_t min) = 0; - /** * Get the maximum value for the column. 
* @return maximum value */ virtual int64_t getMaximum() const = 0; - - /** - * set new maximum value - * @param max new maximum value - */ - virtual void setMaximum(int64_t max) = 0; - - /** - * update stats by a new value - * @param value new value to update - */ - virtual void update(int64_t value) = 0; }; class Statistics { @@ -621,8 +341,6 @@ namespace orc { virtual uint32_t getNumberOfColumns() const = 0; }; - std::unique_ptr createColumnStatistics( - const Type& type, bool enableStringComparison); } diff --git a/c++/src/Int128.cc b/c++/src/Int128.cc index 5ef71e6d5e..ece78505b8 100644 --- a/c++/src/Int128.cc +++ b/c++/src/Int128.cc @@ -435,76 +435,4 @@ namespace orc { return buf.str(); } - Int128 scaleInt128(Int128 value, - int32_t scale, - bool &overflow) { - overflow = false; - - while (scale > 0) { - int32_t step = std::min(scale, MAX_PRECISION_64); - if (value > 0 && Int128::maximumValue() / POWERS_OF_TEN[step] < value) { - overflow = true; - return Int128::maximumValue(); - } else if (value < 0 && Int128::minimumValue() / POWERS_OF_TEN[step] > value) { - overflow = true; - return Int128::minimumValue(); - } - - value *= POWERS_OF_TEN[step]; - scale -= step; - } - - return value; - } - - Int128 downScaleInt128(Int128 value, int32_t scale) { - while (scale > 0) { - int32_t step = std::min(std::abs(scale), MAX_PRECISION_64); - value /= POWERS_OF_TEN[step]; - scale -= step; - } - return value; - } - - int32_t decimalCompare(Int128 lValue, int32_t lScale, - Int128 rValue, int32_t rScale) { - // compare integral parts - Int128 lIntegral = downScaleInt128(lValue, lScale); - Int128 rIntegral = downScaleInt128(rValue, rScale); - - if (lIntegral < rIntegral) { - return -1; - } else if (lIntegral > rIntegral) { - return 1; - } - - // integral parts are equal, continue comparing fractional parts - // unneccessary to check overflow here because the scaled number will not - // exceed original ones - bool overflow = false, positive = lValue >= 0; - lValue -= 
scaleInt128(lIntegral, lScale, overflow); - rValue -= scaleInt128(rIntegral, rScale, overflow); - - int32_t diff = lScale - rScale; - if (diff > 0) { - rValue = scaleInt128(rValue, diff, overflow); - if (overflow) { - return positive ? -1 : 1; - } - } else { - lValue = scaleInt128(lValue, -diff, overflow); - if (overflow) { - return positive ? 1 : -1; - } - } - - if (lValue < rValue) { - return -1; - } else if (lValue > rValue) { - return 1; - } else { - return 0; - } - } - } diff --git a/c++/src/Statistics.cc b/c++/src/Statistics.cc index 83788c98ab..083b722a44 100644 --- a/c++/src/Statistics.cc +++ b/c++/src/Statistics.cc @@ -146,94 +146,87 @@ namespace orc { } ColumnStatisticsImpl::ColumnStatisticsImpl - (const proto::ColumnStatistics& pb) { + (const proto::ColumnStatistics& pb) { valueCount = pb.numberofvalues(); - hasNullValue = pb.hasnull(); } BinaryColumnStatisticsImpl::BinaryColumnStatisticsImpl - (const proto::ColumnStatistics& pb, bool correctStats){ + (const proto::ColumnStatistics& pb, bool correctStats){ valueCount = pb.numberofvalues(); - hasNullValue = pb.hasnull(); if (!pb.has_binarystatistics() || !correctStats) { - hasTotalLengthValue = false; + _hasTotalLength = false; + totalLength = 0; }else{ - hasTotalLengthValue = pb.binarystatistics().has_sum(); + _hasTotalLength = pb.binarystatistics().has_sum(); totalLength = static_cast(pb.binarystatistics().sum()); } } BooleanColumnStatisticsImpl::BooleanColumnStatisticsImpl - (const proto::ColumnStatistics& pb, bool correctStats){ + (const proto::ColumnStatistics& pb, bool correctStats){ valueCount = pb.numberofvalues(); - hasNullValue = pb.hasnull(); if (!pb.has_bucketstatistics() || !correctStats) { - hasCountValue = false; + _hasCount = false; trueCount = 0; }else{ - hasCountValue = true; + _hasCount = true; trueCount = pb.bucketstatistics().count(0); } } DateColumnStatisticsImpl::DateColumnStatisticsImpl - (const proto::ColumnStatistics& pb, bool correctStats){ + (const proto::ColumnStatistics& pb, 
bool correctStats){ valueCount = pb.numberofvalues(); - hasNullValue = pb.hasnull(); if (!pb.has_datestatistics() || !correctStats) { - hasMinimumValue = false; - hasMaximumValue = false; + _hasMinimum = false; + _hasMaximum = false; minimum = 0; maximum = 0; } else { - hasMinimumValue = pb.datestatistics().has_minimum(); - hasMaximumValue = pb.datestatistics().has_maximum(); + _hasMinimum = pb.datestatistics().has_minimum(); + _hasMaximum = pb.datestatistics().has_maximum(); minimum = pb.datestatistics().minimum(); maximum = pb.datestatistics().maximum(); } } DecimalColumnStatisticsImpl::DecimalColumnStatisticsImpl - (const proto::ColumnStatistics& pb, bool correctStats): minimum(0, 0), - maximum(0, 0), - sum(0, 0){ + (const proto::ColumnStatistics& pb, bool correctStats){ valueCount = pb.numberofvalues(); - hasNullValue = pb.hasnull(); if (!pb.has_decimalstatistics() || !correctStats) { - hasMinimumValue = false; - hasMaximumValue = false; - hasSumValue = false; + _hasMinimum = false; + _hasMaximum = false; + _hasSum = false; }else{ const proto::DecimalStatistics& stats = pb.decimalstatistics(); - hasMinimumValue = stats.has_minimum(); - hasMaximumValue = stats.has_maximum(); - hasSumValue = stats.has_sum(); + _hasMinimum = stats.has_minimum(); + _hasMaximum = stats.has_maximum(); + _hasSum = stats.has_sum(); - minimum = Decimal(stats.minimum()); - maximum = Decimal(stats.maximum()); - sum = Decimal(stats.sum()); + minimum = stats.minimum(); + maximum = stats.maximum(); + sum = stats.sum(); } } DoubleColumnStatisticsImpl::DoubleColumnStatisticsImpl - (const proto::ColumnStatistics& pb){ + (const proto::ColumnStatistics& pb){ valueCount = pb.numberofvalues(); - hasNullValue = pb.hasnull(); if (!pb.has_doublestatistics()) { - hasMinimumValue = false; - hasMaximumValue = false; - hasSumValue = false; + _hasMinimum = false; + _hasMaximum = false; + _hasSum = false; minimum = 0; maximum = 0; sum = 0; }else{ const proto::DoubleStatistics& stats = pb.doublestatistics(); 
- hasMinimumValue = stats.has_minimum(); - hasMaximumValue = stats.has_maximum(); - hasSumValue = stats.has_sum(); + _hasMinimum = stats.has_minimum(); + _hasMaximum = stats.has_maximum(); + _hasSum = stats.has_sum(); minimum = stats.minimum(); maximum = stats.maximum(); @@ -242,22 +235,21 @@ namespace orc { } IntegerColumnStatisticsImpl::IntegerColumnStatisticsImpl - (const proto::ColumnStatistics& pb){ + (const proto::ColumnStatistics& pb){ valueCount = pb.numberofvalues(); - hasNullValue = pb.hasnull(); if (!pb.has_intstatistics()) { - hasMinimumValue = false; - hasMaximumValue = false; - hasSumValue = false; + _hasMinimum = false; + _hasMaximum = false; + _hasSum = false; minimum = 0; maximum = 0; sum = 0; }else{ const proto::IntegerStatistics& stats = pb.intstatistics(); - hasMinimumValue = stats.has_minimum(); - hasMaximumValue = stats.has_maximum(); - hasSumValue = stats.has_sum(); + _hasMinimum = stats.has_minimum(); + _hasMaximum = stats.has_maximum(); + _hasSum = stats.has_sum(); minimum = stats.minimum(); maximum = stats.maximum(); @@ -266,20 +258,19 @@ namespace orc { } StringColumnStatisticsImpl::StringColumnStatisticsImpl - (const proto::ColumnStatistics& pb, bool correctStats){ + (const proto::ColumnStatistics& pb, bool correctStats){ valueCount = pb.numberofvalues(); - hasNullValue = pb.hasnull(); if (!pb.has_stringstatistics() || !correctStats) { - hasMinimumValue = false; - hasMaximumValue = false; - hasTotalLen = false; + _hasMinimum = false; + _hasMaximum = false; + _hasTotalLength = false; totalLength = 0; }else{ const proto::StringStatistics& stats = pb.stringstatistics(); - hasMinimumValue = stats.has_minimum(); - hasMaximumValue = stats.has_maximum(); - hasTotalLen = stats.has_sum(); + _hasMinimum = stats.has_minimum(); + _hasMaximum = stats.has_maximum(); + _hasTotalLength = stats.has_sum(); minimum = stats.minimum(); maximum = stats.maximum(); @@ -288,67 +279,21 @@ namespace orc { } 
TimestampColumnStatisticsImpl::TimestampColumnStatisticsImpl - (const proto::ColumnStatistics& pb, bool correctStats) { + (const proto::ColumnStatistics& pb, bool correctStats) { valueCount = pb.numberofvalues(); - hasNullValue = pb.hasnull(); if (!pb.has_timestampstatistics() || !correctStats) { - hasMinimumValue = false; - hasMaximumValue = false; + _hasMinimum = false; + _hasMaximum = false; minimum = 0; maximum = 0; }else{ const proto::TimestampStatistics& stats = pb.timestampstatistics(); - hasMinimumValue = stats.has_minimum(); - hasMaximumValue = stats.has_maximum(); + _hasMinimum = stats.has_minimum(); + _hasMaximum = stats.has_maximum(); minimum = stats.minimum(); maximum = stats.maximum(); } } - std::unique_ptr createColumnStatistics( - const Type& type, bool enableStrCmp) { - switch (static_cast(type.getKind())) { - case BOOLEAN: - return std::unique_ptr( - new BooleanColumnStatisticsImpl()); - case BYTE: - case INT: - case LONG: - case SHORT: - return std::unique_ptr( - new IntegerColumnStatisticsImpl()); - case STRUCT: - case MAP: - case LIST: - case UNION: - return std::unique_ptr( - new ColumnStatisticsImpl()); - case FLOAT: - case DOUBLE: - return std::unique_ptr( - new DoubleColumnStatisticsImpl()); - case BINARY: - return std::unique_ptr( - new BinaryColumnStatisticsImpl()); - case STRING: - case CHAR: - case VARCHAR: - return std::unique_ptr( - new StringColumnStatisticsImpl(enableStrCmp)); - case DATE: - return std::unique_ptr( - new DateColumnStatisticsImpl()); - case TIMESTAMP: - return std::unique_ptr( - new TimestampColumnStatisticsImpl()); - case DECIMAL: - return std::unique_ptr( - new DecimalColumnStatisticsImpl()); - default: - throw NotImplementedYet("Not supported type " + type.toString() + - " for ColumnStatistics"); - } - } - }// namespace diff --git a/c++/src/Statistics.hh b/c++/src/Statistics.hh index c44cffcd8a..1d1006ea2c 100644 --- a/c++/src/Statistics.hh +++ b/c++/src/Statistics.hh @@ -32,87 +32,55 @@ namespace orc { */ class 
ColumnStatisticsImpl: public ColumnStatistics { + private: + uint64_t valueCount; + public: - ColumnStatisticsImpl() { - reset(); - } ColumnStatisticsImpl(const proto::ColumnStatistics& stats); virtual ~ColumnStatisticsImpl(); + uint64_t getNumberOfValues() const override { + return valueCount; + } + std::string toString() const override { std::ostringstream buffer; - buffer << "Column has " << valueCount << " values" - << " and has null value: " << (hasNullValue ? "yes " : "no") - << std::endl; + buffer << "Column has " << valueCount << " values" << std::endl; return buffer.str(); } }; class BinaryColumnStatisticsImpl: public BinaryColumnStatistics { private: - bool hasTotalLengthValue; + bool _hasTotalLength; + uint64_t valueCount; uint64_t totalLength; public: - BinaryColumnStatisticsImpl() { - reset(); - } BinaryColumnStatisticsImpl(const proto::ColumnStatistics& stats, bool correctStats); virtual ~BinaryColumnStatisticsImpl(); bool hasTotalLength() const override { - return hasTotalLengthValue; + return _hasTotalLength; } - - void setHasTotalLength(bool newHasTotalLength) override { - hasTotalLengthValue = newHasTotalLength; + uint64_t getNumberOfValues() const override { + return valueCount; } uint64_t getTotalLength() const override { - if(hasTotalLengthValue){ + if(_hasTotalLength){ return totalLength; }else{ throw ParseError("Total length is not defined."); } } - void setTotalLength(uint64_t length) override { - this->totalLength = length; - } - - void reset() override { - ColumnStatistics::reset(); - hasTotalLengthValue = false; - totalLength = 0; - } - - void merge(const ColumnStatistics& other) override { - ColumnStatistics::merge(other); - - const BinaryColumnStatistics& binColStats = - dynamic_cast(other); - - totalLength += binColStats.getTotalLength(); - } - - void toProtoBuf(proto::ColumnStatistics& pbStats) const override { - ColumnStatistics::toProtoBuf(pbStats); - - proto::BinaryStatistics* binStats = pbStats.mutable_binarystatistics(); - 
binStats->set_sum(static_cast(totalLength)); - } - - void update(const char*, size_t length) override { - totalLength += length; - } - std::string toString() const override { std::ostringstream buffer; buffer << "Data type: Binary" << std::endl - << "Values: " << valueCount << std::endl - << "Has null: " << (hasNullValue ? "yes" : "no") << std::endl; - if(hasTotalLengthValue){ + << "Values: " << valueCount << std::endl; + if(_hasTotalLength){ buffer << "Total length: " << totalLength << std::endl; }else{ buffer << "Total length: not defined" << std::endl; @@ -123,23 +91,24 @@ namespace orc { class BooleanColumnStatisticsImpl: public BooleanColumnStatistics { private: - bool hasCountValue; + bool _hasCount; + uint64_t valueCount; uint64_t trueCount; public: - BooleanColumnStatisticsImpl() { - reset(); - } - BooleanColumnStatisticsImpl(const proto::ColumnStatistics& stats, - bool correctStats); + BooleanColumnStatisticsImpl(const proto::ColumnStatistics& stats, bool correctStats); virtual ~BooleanColumnStatisticsImpl(); bool hasCount() const override { - return hasCountValue; + return _hasCount; + } + + uint64_t getNumberOfValues() const override { + return valueCount; } uint64_t getFalseCount() const override { - if(hasCountValue){ + if(_hasCount){ return valueCount - trueCount; }else{ throw ParseError("False count is not defined."); @@ -147,57 +116,18 @@ namespace orc { } uint64_t getTrueCount() const override { - if(hasCountValue){ + if(_hasCount){ return trueCount; }else{ throw ParseError("True count is not defined."); } } - virtual void setTrueCount(uint64_t count) override { - this->trueCount = count; - } - - void setHasCount(bool hasCount) override { - this->hasCountValue = hasCount; - } - - void reset() override { - ColumnStatistics::reset(); - hasCountValue = true; - trueCount = 0; - } - - void merge(const ColumnStatistics& other) override { - ColumnStatistics::merge(other); - - const BooleanColumnStatistics& boolStats = - dynamic_cast(other); - - trueCount 
+= boolStats.getTrueCount(); - } - - void toProtoBuf(proto::ColumnStatistics& pbStats) const override { - ColumnStatistics::toProtoBuf(pbStats); - - proto::BucketStatistics* bucketStats = pbStats.mutable_bucketstatistics(); - if (hasCountValue) { - bucketStats->add_count(trueCount); - } - } - - void update(bool value, size_t repetitions) override { - if (value) { - trueCount += repetitions; - } - } - std::string toString() const override { std::ostringstream buffer; buffer << "Data type: Boolean" << std::endl - << "Values: " << valueCount << std::endl - << "Has null: " << (hasNullValue ? "yes" : "no") << std::endl; - if(hasCountValue){ + << "Values: " << valueCount << std::endl; + if(_hasCount){ buffer << "(true: " << trueCount << "; false: " << valueCount - trueCount << ")" << std::endl; } else { @@ -210,29 +140,30 @@ namespace orc { class DateColumnStatisticsImpl: public DateColumnStatistics { private: - bool hasMinimumValue; - bool hasMaximumValue; + bool _hasMinimum; + bool _hasMaximum; + uint64_t valueCount; int32_t minimum; int32_t maximum; public: - DateColumnStatisticsImpl() { - reset(); - } - DateColumnStatisticsImpl(const proto::ColumnStatistics& stats, - bool correctStats); + DateColumnStatisticsImpl(const proto::ColumnStatistics& stats, bool correctStats); virtual ~DateColumnStatisticsImpl(); bool hasMinimum() const override { - return hasMinimumValue; + return _hasMinimum; } bool hasMaximum() const override { - return hasMaximumValue; + return _hasMaximum; + } + + uint64_t getNumberOfValues() const override { + return valueCount; } int32_t getMinimum() const override { - if(hasMinimumValue){ + if(_hasMinimum){ return minimum; }else{ throw ParseError("Minimum is not defined."); @@ -240,86 +171,24 @@ namespace orc { } int32_t getMaximum() const override { - if(hasMaximumValue){ + if(_hasMaximum){ return maximum; }else{ throw ParseError("Maximum is not defined."); } } - void setMinimum(int32_t min) override { - this->minimum = min; - this->hasMinimumValue 
= true; - } - - void setMaximum(int32_t max) override { - this->maximum = max; - this->hasMaximumValue = true; - } - - void reset() override { - ColumnStatistics::reset(); - hasMinimumValue = false; - hasMaximumValue = false; - minimum = std::numeric_limits::min(); - maximum = std::numeric_limits::max(); - } - - void merge(const ColumnStatistics& other) override { - ColumnStatistics::merge(other); - - const DateColumnStatistics& dateStats = - dynamic_cast(other); - - if (dateStats.hasMinimum()) { - if (!hasMinimumValue) { - hasMinimumValue = hasMaximumValue = true; - minimum = dateStats.getMinimum(); - maximum = dateStats.getMaximum(); - } else { - if (dateStats.getMaximum() > maximum) { - maximum = dateStats.getMaximum(); - } - if (dateStats.getMinimum() < minimum) { - minimum = dateStats.getMinimum(); - } - } - } - } - - void toProtoBuf(proto::ColumnStatistics& pbStats) const override { - ColumnStatistics::toProtoBuf(pbStats); - - if (hasMinimumValue) { - proto::DateStatistics* dateStatistics = pbStats.mutable_datestatistics(); - dateStatistics->set_maximum(maximum); - dateStatistics->set_minimum(minimum); - } - } - - void update(int32_t value) override { - if (!hasMinimumValue) { - maximum = minimum = value; - hasMaximumValue = hasMinimumValue = true; - } else if (value < minimum) { - minimum = value; - } else if (value > maximum) { - maximum = value; - } - } - std::string toString() const override { std::ostringstream buffer; buffer << "Data type: Date" << std::endl - << "Values: " << valueCount << std::endl - << "Has null: " << (hasNullValue ? 
"yes" : "no") << std::endl; - if(hasMinimumValue){ + << "Values: " << valueCount << std::endl; + if(_hasMinimum){ buffer << "Minimum: " << minimum << std::endl; }else{ buffer << "Minimum: not defined" << std::endl; } - if(hasMaximumValue){ + if(_hasMaximum){ buffer << "Maximum: " << maximum << std::endl; }else{ buffer << "Maximum: not defined" << std::endl; @@ -330,246 +199,116 @@ namespace orc { class DecimalColumnStatisticsImpl: public DecimalColumnStatistics { private: - bool hasMinimumValue; - bool hasMaximumValue; - bool hasSumValue; - Decimal minimum; - Decimal maximum; - Decimal sum; + bool _hasMinimum; + bool _hasMaximum; + bool _hasSum; + uint64_t valueCount; + std::string minimum; + std::string maximum; + std::string sum; public: - DecimalColumnStatisticsImpl(): minimum(0, 0), - maximum(0, 0), - sum(0, 0) { - reset(); - } - DecimalColumnStatisticsImpl(const proto::ColumnStatistics& stats, - bool correctStats); + DecimalColumnStatisticsImpl(const proto::ColumnStatistics& stats, bool correctStats); virtual ~DecimalColumnStatisticsImpl(); bool hasMinimum() const override { - return hasMinimumValue; + return _hasMinimum; } bool hasMaximum() const override { - return hasMaximumValue; + return _hasMaximum; } bool hasSum() const override { - return hasSumValue; + return _hasSum; + } + + uint64_t getNumberOfValues() const override { + return valueCount; } Decimal getMinimum() const override { - if(hasMinimumValue){ - return minimum; + if(_hasMinimum){ + return Decimal(minimum); }else{ throw ParseError("Minimum is not defined."); } } Decimal getMaximum() const override { - if(hasMaximumValue){ - return maximum; + if(_hasMaximum){ + return Decimal(maximum); }else{ throw ParseError("Maximum is not defined."); } } Decimal getSum() const override { - if(hasSumValue){ - return sum; + if(_hasSum){ + return Decimal(sum); }else{ throw ParseError("Sum is not defined."); } } - void setMinimum(Decimal min) override { - this->hasMinimumValue = true; - minimum = min; - } - - 
void setMaximum(Decimal max) override { - this->hasMaximumValue = true; - maximum = max; - } - - void setSum(Decimal newSum) override { - this->hasSumValue = true; - sum = newSum; - } - - void setHasSum(bool hasSum) override { - this->hasSumValue = hasSum; - } - - void reset() override { - ColumnStatistics::reset(); - hasMinimumValue = false; - hasMaximumValue = false; - hasSumValue = true; - maximum = Decimal(0, 0); - minimum = Decimal(0, 0); - sum = Decimal(0, 0); - } - - void merge(const ColumnStatistics& other) override { - ColumnStatistics::merge(other); - - const DecimalColumnStatistics& decStats = - dynamic_cast(other); - - if (decStats.hasMinimum()) { - if (!hasMinimumValue) { - hasMinimumValue = hasMaximumValue = true; - minimum = decStats.getMinimum(); - maximum = decStats.getMaximum(); - } else { - if (decimalCompare(maximum.value, - maximum.scale, - decStats.getMaximum().value, - decStats.getMaximum().scale) < 0) { - maximum = decStats.getMaximum(); - } - if (decimalCompare(minimum.value, - minimum.scale, - decStats.getMinimum().value, - decStats.getMinimum().scale) > 0) { - minimum = decStats.getMinimum(); - } - } - } - - // hasSumValue here means no overflow - hasSumValue &= decStats.hasSum(); - if (hasSumValue) { - updateSum(decStats.getSum().value, decStats.getSum().scale); - } - } - - void toProtoBuf(proto::ColumnStatistics& pbStats) const override { - ColumnStatistics::toProtoBuf(pbStats); - - proto::DecimalStatistics* decStats = pbStats.mutable_decimalstatistics(); - if (hasMinimumValue) { - decStats->set_minimum(minimum.toString()); - decStats->set_maximum(maximum.toString()); - } - if (hasSumValue) { - decStats->set_sum(sum.toString()); - } - } - - void update(const Decimal& decimal) override { - update(decimal.value, decimal.scale); - } - - void update(int64_t value, int32_t scale) override { - update(Int128(value), scale); - } - - void update(Int128 value, int32_t scale) override { - if (!hasMinimumValue) { - hasMinimumValue = hasMaximumValue 
= true; - minimum = maximum = Decimal(value, scale); - } else { - if (decimalCompare(value, - scale, - minimum.value, - minimum.scale) < 0) { - minimum = Decimal(value, scale); - } else if (decimalCompare(maximum.value, - maximum.scale, - value, - scale) < 0) { - maximum = Decimal(value, scale); - } - } - - if (hasSumValue) { - updateSum(value, scale); - } - } - std::string toString() const override { std::ostringstream buffer; buffer << "Data type: Decimal" << std::endl - << "Values: " << valueCount << std::endl - << "Has null: " << (hasNullValue ? "yes" : "no") << std::endl; - if(hasMinimumValue){ - buffer << "Minimum: " << minimum.toString() << std::endl; + << "Values: " << valueCount << std::endl; + if(_hasMinimum){ + buffer << "Minimum: " << minimum << std::endl; }else{ buffer << "Minimum: not defined" << std::endl; } - if(hasMaximumValue){ - buffer << "Maximum: " << maximum.toString() << std::endl; + if(_hasMaximum){ + buffer << "Maximum: " << maximum << std::endl; }else{ buffer << "Maximum: not defined" << std::endl; } - if(hasSumValue){ - buffer << "Sum: " << sum.toString() << std::endl; + if(_hasSum){ + buffer << "Sum: " << sum << std::endl; }else{ buffer << "Sum: not defined" << std::endl; } return buffer.str(); } - - private: - void updateSum(Int128 value, int32_t scale) { - if (hasSumValue) { - bool overflow = false; - if (sum.scale > scale) { - value = scaleInt128(value, sum.scale - scale, overflow); - } else if (sum.scale < scale) { - sum.value = scaleInt128(sum.value, scale - sum.scale, overflow); - sum.scale = scale; - } - - if (!overflow) { - bool wasPositive = sum.value >= 0; - sum.value += value; - if ((value >= 0) == wasPositive) { - hasSumValue = (sum.value >= 0) == wasPositive; - } - } else { - hasSumValue = false; - } - } - } }; class DoubleColumnStatisticsImpl: public DoubleColumnStatistics { private: - bool hasMinimumValue; - bool hasMaximumValue; - bool hasSumValue; + bool _hasMinimum; + bool _hasMaximum; + bool _hasSum; + uint64_t 
valueCount; double minimum; double maximum; double sum; public: - DoubleColumnStatisticsImpl() { - reset(); - } DoubleColumnStatisticsImpl(const proto::ColumnStatistics& stats); virtual ~DoubleColumnStatisticsImpl(); bool hasMinimum() const override { - return hasMinimumValue; + return _hasMinimum; } bool hasMaximum() const override { - return hasMaximumValue; + return _hasMaximum; } bool hasSum() const override { - return hasSumValue; + return _hasSum; + } + + uint64_t getNumberOfValues() const override { + return valueCount; } double getMinimum() const override { - if(hasMinimumValue){ + if(_hasMinimum){ return minimum; }else{ throw ParseError("Minimum is not defined."); @@ -577,7 +316,7 @@ namespace orc { } double getMaximum() const override { - if(hasMaximumValue){ + if(_hasMaximum){ return maximum; }else{ throw ParseError("Maximum is not defined."); @@ -585,109 +324,30 @@ namespace orc { } double getSum() const override { - if(hasSumValue){ + if(_hasSum){ return sum; }else{ throw ParseError("Sum is not defined."); } } - void setMinimum(double min) override { - this->minimum = min; - this->hasMinimumValue = true; - } - - void setMaximum(double max) override { - this->maximum = max; - this->hasMaximumValue = true; - } - - void setSum(double newSum) override { - this->sum = newSum; - this->hasSumValue = true; - } - - void setHasSum(bool hasSum) override { - this->hasSumValue = hasSum; - } - - void reset() override { - ColumnStatistics::reset(); - hasMinimumValue = false; - hasMaximumValue = false; - minimum = std::numeric_limits::min(); - maximum = std::numeric_limits::max(); - hasSumValue = true; - sum = 0.0; - } - - void merge(const ColumnStatistics& other) override { - ColumnStatistics::merge(other); - - const DoubleColumnStatistics& doubleColStats = - dynamic_cast(other); - - if (doubleColStats.hasMinimum()) { - if (!hasMinimumValue) { - hasMinimumValue = hasMaximumValue = true; - minimum = doubleColStats.getMinimum(); - maximum = doubleColStats.getMaximum(); 
- } else { - if (doubleColStats.getMaximum() > maximum) { - maximum = doubleColStats.getMaximum(); - } - if (doubleColStats.getMinimum() < minimum) { - minimum = doubleColStats.getMinimum(); - } - } - } - - sum += doubleColStats.getSum(); - } - - void update(double value) override { - if (!hasMinimumValue) { - maximum = minimum = value; - hasMaximumValue = hasMinimumValue = true; - } else if (value < minimum) { - minimum = value; - } else if (value > maximum) { - maximum = value; - } - sum += value; - } - - void toProtoBuf(proto::ColumnStatistics& pbStats) const override { - ColumnStatistics::toProtoBuf(pbStats); - - proto::DoubleStatistics* doubleStats = pbStats.mutable_doublestatistics(); - if (hasMinimumValue) { - doubleStats->set_minimum(minimum); - doubleStats->set_maximum(maximum); - } - if (hasSumValue) { - doubleStats->set_sum(sum); - } - } - std::string toString() const override { std::ostringstream buffer; buffer << "Data type: Double" << std::endl - << "Values: " << valueCount << std::endl - << "Has null: " << (hasNullValue ? 
"yes" : "no") << std::endl; - if(hasMinimumValue){ + << "Values: " << valueCount << std::endl; + if(_hasMinimum){ buffer << "Minimum: " << minimum << std::endl; }else{ buffer << "Minimum: not defined" << std::endl; } - if(hasMaximumValue){ + if(_hasMaximum){ buffer << "Maximum: " << maximum << std::endl; }else{ buffer << "Maximum: not defined" << std::endl; } - if(hasSumValue){ + if(_hasSum){ buffer << "Sum: " << sum << std::endl; }else{ buffer << "Sum: not defined" << std::endl; @@ -698,34 +358,36 @@ namespace orc { class IntegerColumnStatisticsImpl: public IntegerColumnStatistics { private: - bool hasMinimumValue; - bool hasMaximumValue; - bool hasSumValue; + bool _hasMinimum; + bool _hasMaximum; + bool _hasSum; + uint64_t valueCount; int64_t minimum; int64_t maximum; int64_t sum; public: - IntegerColumnStatisticsImpl() { - reset(); - } IntegerColumnStatisticsImpl(const proto::ColumnStatistics& stats); virtual ~IntegerColumnStatisticsImpl(); bool hasMinimum() const override { - return hasMinimumValue; + return _hasMinimum; } bool hasMaximum() const override { - return hasMaximumValue; + return _hasMaximum; } bool hasSum() const override { - return hasSumValue; + return _hasSum; + } + + uint64_t getNumberOfValues() const override { + return valueCount; } int64_t getMinimum() const override { - if(hasMinimumValue){ + if(_hasMinimum){ return minimum; }else{ throw ParseError("Minimum is not defined."); @@ -733,7 +395,7 @@ namespace orc { } int64_t getMaximum() const override { - if(hasMaximumValue){ + if(_hasMaximum){ return maximum; }else{ throw ParseError("Maximum is not defined."); @@ -741,124 +403,30 @@ namespace orc { } int64_t getSum() const override { - if(hasSumValue){ + if(_hasSum){ return sum; }else{ throw ParseError("Sum is not defined."); } } - void setMinimum(int64_t min) override { - this->minimum = min; - this->hasMinimumValue = true; - } - - void setMaximum(int64_t max) override { - this->maximum = max; - this->hasMaximumValue = true; - } - - void 
setSum(int64_t newSum) override { - this->sum = newSum; - this->hasSumValue = true; - } - - void setHasSum(bool hasSum) override { - this->hasSumValue = hasSum; - } - - void reset() override { - ColumnStatistics::reset(); - hasMinimumValue = false; - hasMaximumValue = false; - minimum = std::numeric_limits::min(); - maximum = std::numeric_limits::max(); - hasSumValue = true; - sum = 0; - } - - void merge(const ColumnStatistics& other) override { - ColumnStatistics::merge(other); - - const IntegerColumnStatistics& intColStats = - dynamic_cast(other); - - if (intColStats.hasMinimum()) { - if (!hasMinimumValue) { - hasMinimumValue = hasMaximumValue = true; - minimum = intColStats.getMinimum(); - maximum = intColStats.getMaximum(); - } else { - if (intColStats.getMaximum() > maximum) { - maximum = intColStats.getMaximum(); - } - if (intColStats.getMinimum() < minimum) { - minimum = intColStats.getMinimum(); - } - } - } - - // hasSumValue here means no overflow - hasSumValue &= intColStats.hasSum(); - if (hasSumValue) { - bool wasPositive = sum >= 0; - sum += intColStats.getSum(); - if ((intColStats.getSum() >= 0) == wasPositive) { - hasSumValue = (sum >= 0) == wasPositive; - } - } - } - - void update(int64_t value, int repetitions) override { - if (!hasMinimumValue) { - maximum = minimum = value; - hasMaximumValue = hasMinimumValue = true; - } else if (value < minimum) { - minimum = value; - } else if (value > maximum) { - maximum = value; - } - if (hasSumValue) { - bool wasPositive = sum >= 0; - sum += value * repetitions; - if ((value >= 0) == wasPositive) { - hasSumValue = (sum >= 0) == wasPositive; - } - } - } - - void toProtoBuf(proto::ColumnStatistics& pbStats) const override { - ColumnStatistics::toProtoBuf(pbStats); - - proto::IntegerStatistics* intStats = pbStats.mutable_intstatistics(); - if (hasMinimumValue) { - intStats->set_minimum(minimum); - intStats->set_maximum(maximum); - } - if (hasSumValue) { - intStats->set_sum(sum); - } - } - std::string 
toString() const override { std::ostringstream buffer; buffer << "Data type: Integer" << std::endl - << "Values: " << valueCount << std::endl - << "Has null: " << (hasNullValue ? "yes" : "no") << std::endl; - - if(hasMinimumValue){ + << "Values: " << valueCount << std::endl; + if(_hasMinimum){ buffer << "Minimum: " << minimum << std::endl; }else{ buffer << "Minimum: not defined" << std::endl; } - if(hasMaximumValue){ + if(_hasMaximum){ buffer << "Maximum: " << maximum << std::endl; }else{ buffer << "Maximum: not defined" << std::endl; } - if(hasSumValue){ + if(_hasSum){ buffer << "Sum: " << sum << std::endl; }else{ buffer << "Sum: not defined" << std::endl; @@ -867,39 +435,38 @@ namespace orc { } }; - class StringColumnStatisticsImpl: public StringColumnStatistics { private: - bool hasMinimumValue; - bool hasMaximumValue; - bool hasTotalLen; + bool _hasMinimum; + bool _hasMaximum; + bool _hasTotalLength; + uint64_t valueCount; std::string minimum; std::string maximum; uint64_t totalLength; public: - StringColumnStatisticsImpl(bool enableStrComparision) { - enableStringComparison = enableStrComparision; - reset(); - } - StringColumnStatisticsImpl(const proto::ColumnStatistics& stats, - bool correctStats); + StringColumnStatisticsImpl(const proto::ColumnStatistics& stats, bool correctStats); virtual ~StringColumnStatisticsImpl(); bool hasMinimum() const override { - return hasMinimumValue; + return _hasMinimum; } bool hasMaximum() const override { - return hasMaximumValue; + return _hasMaximum; } bool hasTotalLength() const override { - return hasTotalLen; + return _hasTotalLength; + } + + uint64_t getNumberOfValues() const override { + return valueCount; } std::string getMinimum() const override { - if(hasMinimumValue){ + if(_hasMinimum){ return minimum; }else{ throw ParseError("Minimum is not defined."); @@ -907,7 +474,7 @@ namespace orc { } std::string getMaximum() const override { - if(hasMaximumValue){ + if(_hasMaximum){ return maximum; }else{ throw 
ParseError("Maximum is not defined."); @@ -915,178 +482,65 @@ namespace orc { } uint64_t getTotalLength() const override { - if(hasTotalLen){ + if(_hasTotalLength){ return totalLength; }else{ throw ParseError("Total length is not defined."); } } - void setMinimum(std::string min) override { - this->minimum = min; - this->hasMinimumValue = true; - } - - void setMaximum(std::string max) override { - this->maximum = max; - this->hasMaximumValue = true; - } - - void setTotalLength(uint64_t newTotalLength) override { - this->totalLength = newTotalLength; - this->hasTotalLen = true; - } - - void setHasTotalLength(bool newHasTotalLength) override { - this->hasTotalLen = newHasTotalLength; - } - - void reset() override { - ColumnStatistics::reset(); - hasMinimumValue = false; - hasMaximumValue = false; - minimum = std::string(); - maximum = std::string(); - hasTotalLen = true; - totalLength = 0; - } - - void merge(const ColumnStatistics& other) override { - ColumnStatistics::merge(other); - - const StringColumnStatistics& strColStats = - dynamic_cast(other); - - if (enableStringComparison) { - if (strColStats.hasMinimum()) { - if (!hasMinimumValue) { - hasMinimumValue = hasMaximumValue = true; - minimum = strColStats.getMinimum(); - maximum = strColStats.getMaximum(); - } else { - if (strColStats.getMaximum() > maximum) { - maximum = strColStats.getMaximum(); - } - if (strColStats.getMinimum() < minimum) { - minimum = strColStats.getMinimum(); - } - } - } - } - - totalLength += strColStats.getTotalLength(); - } - - void toProtoBuf(proto::ColumnStatistics& pbStats) const override { - ColumnStatistics::toProtoBuf(pbStats); - - proto::StringStatistics* strStats = pbStats.mutable_stringstatistics(); - if (hasMinimumValue) { - strStats->set_minimum(minimum); - strStats->set_maximum(maximum); - } - - strStats->set_sum(static_cast(totalLength)); - } - - void update(const std::string& value) override { - if (enableStringComparison) { - if (!hasMinimumValue) { - maximum = minimum = 
value; - hasMaximumValue = hasMinimumValue = true; - } else if (value < minimum) { - minimum = value; - } else if (value > maximum) { - maximum = value; - } - } - - totalLength += value.length(); - } - - void update(const char* value, size_t length) override { - if (enableStringComparison && value != nullptr) { - if (!hasMinimumValue) { - maximum = minimum = std::string(value, value + length); - hasMaximumValue = hasMinimumValue = true; - } else { - // update min - int minCmp = strncmp(minimum.c_str(), - value, - std::min(minimum.length(), length)); - if (minCmp > 0 || (minCmp == 0 && length < minimum.length())) { - minimum = std::string(value, value + length); - } - - // update max - int maxCmp = strncmp(maximum.c_str(), - value, - std::min(maximum.length(), length)); - if (maxCmp < 0 || (maxCmp == 0 && length > minimum.length())) { - maximum = std::string(value, value + length); - } - } - } - - totalLength += length; - } - std::string toString() const override { std::ostringstream buffer; buffer << "Data type: String" << std::endl - << "Values: " << valueCount << std::endl - << "Has null: " << (hasNullValue ? 
"yes" : "no") << std::endl; - if(hasMinimumValue){ + << "Values: " << valueCount << std::endl; + if(_hasMinimum){ buffer << "Minimum: " << minimum << std::endl; }else{ buffer << "Minimum is not defined" << std::endl; } - if(hasMaximumValue){ + if(_hasMaximum){ buffer << "Maximum: " << maximum << std::endl; }else{ buffer << "Maximum is not defined" << std::endl; } - if(hasTotalLen){ + if(_hasTotalLength){ buffer << "Total length: " << totalLength << std::endl; }else{ buffer << "Total length is not defined" << std::endl; } return buffer.str(); } - - private: - // a flag to enable string comparision for min/max as it is very - // time-consuming, can be off by default - bool enableStringComparison; }; class TimestampColumnStatisticsImpl: public TimestampColumnStatistics { private: - bool hasMinimumValue; - bool hasMaximumValue; + bool _hasMinimum; + bool _hasMaximum; + uint64_t valueCount; int64_t minimum; int64_t maximum; public: - TimestampColumnStatisticsImpl() { - reset(); - } TimestampColumnStatisticsImpl(const proto::ColumnStatistics& stats, bool correctStats); virtual ~TimestampColumnStatisticsImpl(); bool hasMinimum() const override { - return hasMinimumValue; + return _hasMinimum; } bool hasMaximum() const override { - return hasMaximumValue; + return _hasMaximum; + } + + uint64_t getNumberOfValues() const override { + return valueCount; } int64_t getMinimum() const override { - if(hasMinimumValue){ + if(_hasMinimum){ return minimum; }else{ throw ParseError("Minimum is not defined."); @@ -1094,89 +548,24 @@ namespace orc { } int64_t getMaximum() const override { - if(hasMaximumValue){ + if(_hasMaximum){ return maximum; }else{ throw ParseError("Maximum is not defined."); } } - void setMinimum(int64_t min) override { - this->minimum = min; - this->hasMinimumValue = true; - } - - void setMaximum(int64_t max) override { - this->maximum = max; - this->hasMaximumValue = true; - } - - void reset() override { - ColumnStatistics::reset(); - hasMinimumValue = false; - 
hasMaximumValue = false; - minimum = 0; - maximum = 0; - } - - void merge(const ColumnStatistics& other) override { - ColumnStatistics::merge(other); - - const TimestampColumnStatistics& tsStats = - dynamic_cast(other); - - if (tsStats.hasMinimum()) { - if (!hasMinimumValue) { - hasMinimumValue = hasMaximumValue = true; - minimum = tsStats.getMinimum(); - maximum = tsStats.getMaximum(); - } else { - if (tsStats.getMaximum() > maximum) { - maximum = tsStats.getMaximum(); - } - if (tsStats.getMinimum() < minimum) { - minimum = tsStats.getMinimum(); - } - } - } - } - - void toProtoBuf(proto::ColumnStatistics& pbStats) const override { - ColumnStatistics::toProtoBuf(pbStats); - - if (hasMinimumValue) { - proto::TimestampStatistics* timestampStatistics = - pbStats.mutable_timestampstatistics(); - - // ORC-135: min and max are deprecated, store UTC instead - timestampStatistics->set_maximumutc(maximum); - timestampStatistics->set_minimumutc(minimum); - } - } - - void update(int64_t value) override { - if (!hasMinimumValue) { - maximum = minimum = value; - hasMaximumValue = hasMinimumValue = true; - } else if (value < minimum) { - minimum = value; - } else if (value > maximum) { - maximum = value; - } - } - std::string toString() const override { std::ostringstream buffer; buffer << "Data type: Timestamp" << std::endl - << "Values: " << valueCount << std::endl - << "Has null: " << (hasNullValue ? 
"yes" : "no") << std::endl; - if(hasMinimumValue){ + << "Values: " << valueCount << std::endl; + if(_hasMinimum){ buffer << "Minimum: " << minimum << std::endl; }else{ buffer << "Minimum is not defined" << std::endl; } - if(hasMaximumValue){ + if(_hasMaximum){ buffer << "Maximum: " << maximum << std::endl; }else{ buffer << "Maximum is not defined" << std::endl; @@ -1185,6 +574,7 @@ namespace orc { } }; + ColumnStatistics* convertColumnStatistics(const proto::ColumnStatistics& s, bool correctStats); From e84fa473a9d717a61409a4132cf311344ae18c64 Mon Sep 17 00:00:00 2001 From: Gang Wu Date: Thu, 27 Apr 2017 20:59:28 -0700 Subject: [PATCH 3/4] Remove buildTypeFromString --- c++/include/orc/Type.hh | 5 -- c++/src/TypeImpl.cc | 186 ---------------------------------------- c++/src/TypeImpl.hh | 5 -- 3 files changed, 196 deletions(-) diff --git a/c++/include/orc/Type.hh b/c++/include/orc/Type.hh index 32cbbed99e..25b8f535c3 100644 --- a/c++/include/orc/Type.hh +++ b/c++/include/orc/Type.hh @@ -82,11 +82,6 @@ namespace orc { * @return a reference to the union type */ virtual Type* addUnionChild(ORC_UNIQUE_PTR fieldType) = 0; - - /** - * Build a Type object from string text representation. 
- */ - static Type* buildTypeFromString(const std::string& input); }; const int64_t DEFAULT_DECIMAL_SCALE = 18; diff --git a/c++/src/TypeImpl.cc b/c++/src/TypeImpl.cc index 99a25812c9..fdf66a0f33 100644 --- a/c++/src/TypeImpl.cc +++ b/c++/src/TypeImpl.cc @@ -491,190 +491,4 @@ namespace orc { return std::unique_ptr(result); } - Type * Type::buildTypeFromString(const std::string& input) { - std::vector > res = - TypeImpl::buildTypeFromStringImpl(input, 0, input.size()); - if (res.size() != 1) { - throw std::logic_error("Invalid type string."); - } - return res[0].second; - } - - std::vector > - TypeImpl::buildTypeFromStringImpl(const std::string& input, - size_t start, - size_t end) { - std::string types = input.substr(start, end - start); - std::vector > res; - size_t pos = 0; - - while (pos < types.size()) { - size_t endPos = pos; - while (endPos < types.size() && - ((types[endPos] >= 'a' && types[endPos] <= 'z') || - (types[endPos] >= '0' && types[endPos] <= '9'))) { - ++endPos; - } - - std::string fieldName; - if (types[endPos] == ':') { - fieldName = types.substr(pos, endPos - pos); - pos = ++endPos; - while (endPos < types.size() && types[endPos] >= 'a' - && types[endPos] <= 'z') { - ++endPos; - } - } - - size_t nextPos = endPos + 1; - if (types[endPos] == '<') { - int count = 1; - while (nextPos < types.size()) { - if (types[nextPos] == '<') { - ++count; - } else if (types[nextPos] == '>') { - --count; - } - if (count == 0) { - break; - } - ++nextPos; - } - if (nextPos == types.size()) { - throw std::logic_error("Invalid type string. Cannot find closing >"); - } - } else if (types[endPos] == '(') { - while (nextPos < types.size() && types[nextPos] != ')') { - ++nextPos; - } - if (nextPos == types.size()) { - throw std::logic_error("Invalid type string. 
Cannot find closing )"); - } - } else if (types[endPos] != ',' && types[endPos] != '\0') { - throw std::logic_error("Unrecognize character."); - } - - std::string typeName = types.substr(pos, endPos - pos); - if (typeName == "boolean") { - res.push_back(std::make_pair(fieldName, new TypeImpl(BOOLEAN))); - } else if (typeName == "tinyint") { - res.push_back(std::make_pair(fieldName, new TypeImpl(BYTE))); - } else if (typeName == "smallint") { - res.push_back(std::make_pair(fieldName, new TypeImpl(SHORT))); - } else if (typeName == "int") { - res.push_back(std::make_pair(fieldName, new TypeImpl(INT))); - } else if (typeName == "bigint") { - res.push_back(std::make_pair(fieldName, new TypeImpl(LONG))); - } else if (typeName == "float") { - res.push_back(std::make_pair(fieldName, new TypeImpl(FLOAT))); - } else if (typeName == "double") { - res.push_back(std::make_pair(fieldName, new TypeImpl(DOUBLE))); - } else if (typeName == "string") { - res.push_back(std::make_pair(fieldName, new TypeImpl(STRING))); - } else if (typeName == "binary") { - res.push_back(std::make_pair(fieldName, new TypeImpl(BINARY))); - } else if (typeName == "timestamp") { - res.push_back(std::make_pair(fieldName, new TypeImpl(TIMESTAMP))); - } else if (typeName == "array") { - TypeImpl * arrayType = new TypeImpl(LIST); - std::vector > v = - TypeImpl::buildTypeFromStringImpl( - types, - endPos + 1, - nextPos); - if (v.size() != 1) { - throw std::logic_error( - "Array type must contain exactly one sub type."); - } - arrayType->addChildType(std::unique_ptr(v[0].second)); - res.push_back(std::make_pair(fieldName, arrayType)); - } else if (typeName == "map") { - TypeImpl * mapType = new TypeImpl(MAP); - std::vector > v = - TypeImpl::buildTypeFromStringImpl( - types, - endPos + 1, - nextPos); - if (v.size() != 2) { - throw std::logic_error( - "Map type must contain exactly two sub types."); - } - mapType->addChildType(std::unique_ptr(v[0].second)); - mapType->addChildType(std::unique_ptr(v[1].second)); 
- res.push_back(std::make_pair(fieldName, mapType)); - } else if (typeName == "struct") { - TypeImpl * structType = new TypeImpl(STRUCT); - std::vector > v = - TypeImpl::buildTypeFromStringImpl( - types, - endPos + 1, - nextPos); - if (v.size() == 0) { - throw std::logic_error( - "Struct type must contain at least one sub type."); - } - for (size_t i = 0; i < v.size(); ++i) { - structType->addStructField( - v[i].first, - std::unique_ptr(v[i].second)); - } - res.push_back(std::make_pair(fieldName, structType)); - } else if (typeName == "uniontype") { - TypeImpl * unionType = new TypeImpl(UNION); - std::vector > v = - TypeImpl::buildTypeFromStringImpl( - types, - endPos + 1, - nextPos); - if (v.size() == 0) { - throw std::logic_error( - "Union type must contain at least one sub type."); - } - for (size_t i = 0; i < v.size(); ++i) { - unionType->addChildType(std::unique_ptr(v[i].second)); - } - res.push_back(std::make_pair(fieldName, unionType)); - } else if (typeName == "decimal") { - size_t sep = types.find(',', endPos + 1); - if (sep + 1 >= nextPos || sep == std::string::npos) { - throw std::logic_error( - "Decimal type must specify precision and scale."); - } - uint64_t precision = - static_cast( - atoi( - types.substr(endPos + 1, sep - endPos - 1).c_str())); - uint64_t scale = - static_cast( - atoi(types.substr(sep + 1, nextPos - sep - 1).c_str())); - TypeImpl * decimalType = new TypeImpl(DECIMAL, precision, scale); - res.push_back(std::make_pair(fieldName, decimalType)); - } else if (typeName == "date") { - res.push_back(std::make_pair(fieldName, new TypeImpl(DATE))); - } else if (typeName == "varchar") { - uint64_t maxLength = static_cast( - atoi( - types.substr(endPos + 1, nextPos - endPos - 1).c_str())); - res.push_back( - std::make_pair(fieldName, new TypeImpl(VARCHAR, maxLength))); - } else if (typeName == "char") { - uint64_t maxLength = static_cast( - atoi( - types.substr(endPos + 1, nextPos - endPos - 1).c_str())); - 
res.push_back(std::make_pair(fieldName, new TypeImpl(CHAR, maxLength))); - } - else { - throw std::logic_error("Unknown type " + typeName); - } - - if (types[nextPos] == ')' || types[nextPos] == '>') { - pos = nextPos + 2; - } else { - pos = nextPos; - } - } - - return res; - } - } diff --git a/c++/src/TypeImpl.hh b/c++/src/TypeImpl.hh index 7d7577fd07..e2866e456d 100644 --- a/c++/src/TypeImpl.hh +++ b/c++/src/TypeImpl.hh @@ -98,11 +98,6 @@ namespace orc { */ void addChildType(std::unique_ptr childType); - static std::vector > buildTypeFromStringImpl( - const std::string& input, - size_t start, - size_t end); - private: /** * Assign ids to this node and its children giving this From c584e4a6d4eb6c31a915380825fb73767924eab5 Mon Sep 17 00:00:00 2001 From: Gang Wu Date: Tue, 2 May 2017 10:51:12 -0700 Subject: [PATCH 4/4] Renamed InputStream and moved to src/io --- c++/src/CMakeLists.txt | 2 +- c++/src/Compression.hh | 2 +- c++/src/{Stream.cc => io/InputStream.cc} | 24 ++++++++++++------------ c++/src/{Stream.hh => io/InputStream.hh} | 6 +++--- 4 files changed, 17 insertions(+), 17 deletions(-) rename c++/src/{Stream.cc => io/InputStream.cc} (90%) rename c++/src/{Stream.hh => io/InputStream.hh} (97%) diff --git a/c++/src/CMakeLists.txt b/c++/src/CMakeLists.txt index 51bdf1b5dc..c39437a5f2 100644 --- a/c++/src/CMakeLists.txt +++ b/c++/src/CMakeLists.txt @@ -135,6 +135,7 @@ add_custom_command(OUTPUT orc_proto.pb.h orc_proto.pb.cc add_library (orc STATIC "${CMAKE_CURRENT_BINARY_DIR}/Adaptor.hh" orc_proto.pb.h + io/InputStream.cc wrap/orc-proto-wrapper.cc ByteRLE.cc ColumnPrinter.cc @@ -151,7 +152,6 @@ add_library (orc STATIC RLEv2.cc RLE.cc Statistics.cc - Stream.cc StripeStream.cc Timezone.cc TypeImpl.cc diff --git a/c++/src/Compression.hh b/c++/src/Compression.hh index c02c9a4edc..8c3eda7782 100644 --- a/c++/src/Compression.hh +++ b/c++/src/Compression.hh @@ -19,7 +19,7 @@ #ifndef ORC_COMPRESSION_HH #define ORC_COMPRESSION_HH -#include "Stream.hh" +#include 
"io/InputStream.hh" namespace orc { diff --git a/c++/src/Stream.cc b/c++/src/io/InputStream.cc similarity index 90% rename from c++/src/Stream.cc rename to c++/src/io/InputStream.cc index 9e4d0565c5..fd91b23e70 100644 --- a/c++/src/Stream.cc +++ b/c++/src/io/InputStream.cc @@ -17,7 +17,7 @@ */ #include "Exceptions.hh" -#include "Stream.hh" +#include "InputStream.hh" #include #include @@ -61,10 +61,10 @@ namespace orc { } SeekableArrayInputStream::SeekableArrayInputStream - (const unsigned char* values, - uint64_t size, - uint64_t blkSize - ): data(reinterpret_cast(values)) { + (const unsigned char* values, + uint64_t size, + uint64_t blkSize + ): data(reinterpret_cast(values)) { length = size; position = 0; blockSize = blkSize == 0 ? length : static_cast(blkSize); @@ -138,13 +138,13 @@ namespace orc { uint64_t byteCount, MemoryPool& _pool, uint64_t _blockSize - ):pool(_pool), - input(stream), - start(offset), - length(byteCount), - blockSize(computeBlock - (_blockSize, - length)) { + ):pool(_pool), + input(stream), + start(offset), + length(byteCount), + blockSize(computeBlock + (_blockSize, + length)) { position = 0; buffer.reset(new DataBuffer(pool)); diff --git a/c++/src/Stream.hh b/c++/src/io/InputStream.hh similarity index 97% rename from c++/src/Stream.hh rename to c++/src/io/InputStream.hh index 368a2c298e..5ea8c7ea60 100644 --- a/c++/src/Stream.hh +++ b/c++/src/io/InputStream.hh @@ -16,8 +16,8 @@ * limitations under the License. */ -#ifndef ORC_STREAM_HH -#define ORC_STREAM_HH +#ifndef ORC_INPUTSTREAM_HH +#define ORC_INPUTSTREAM_HH #include "Adaptor.hh" #include "orc/OrcFile.hh" @@ -113,4 +113,4 @@ namespace orc { } -#endif //ORC_STREAM_HH +#endif //ORC_INPUTSTREAM_HH