Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

GH-34351: [C++][Parquet] Statistics: add detail documentation and tiny optimization #35989

Merged
merged 30 commits into from
Jul 6, 2023
Merged
Show file tree
Hide file tree
Changes from 28 commits
Commits
Show all changes
30 commits
Select commit Hold shift + click to select a range
001948b
Parquet: Tiny Optimize logic for statistics
mapleFU Jun 8, 2023
8d52868
update PREDICT_TRUE when Merge
mapleFU Jun 8, 2023
fcda54d
Fixing case when not has_min_max_
mapleFU Jun 8, 2023
e4a2269
Merge branch 'main' into parquet/tiny-optimize-statistics
mapleFU Jun 8, 2023
d4b0ae1
Merge branch 'main' into parquet/tiny-optimize-statistics
mapleFU Jun 14, 2023
33dc32b
fix comment
mapleFU Jun 14, 2023
38bda57
Merge branch 'main' into parquet/tiny-optimize-statistics
mapleFU Jun 17, 2023
8262868
fix part of comment
mapleFU Jun 17, 2023
a3e1a99
Merge branch 'main' into parquet/tiny-optimize-statistics
mapleFU Jun 20, 2023
c8d3b58
fix comment
mapleFU Jun 20, 2023
c91d2b1
tiny update
mapleFU Jun 20, 2023
ba6428c
fix ruby test
mapleFU Jun 20, 2023
9bfdfb0
fix comment
mapleFU Jun 21, 2023
6f44bbe
tiny update
mapleFU Jun 21, 2023
8a35221
refactor the num_values and fix a bug in Reset
mapleFU Jun 21, 2023
6815749
resolve comment
mapleFU Jun 22, 2023
48d1563
Merge branch 'main' into parquet/tiny-optimize-statistics
mapleFU Jun 24, 2023
a1a1c5e
add comments
mapleFU Jun 24, 2023
2892e45
Merge branch 'main' into parquet/tiny-optimize-statistics
mapleFU Jun 26, 2023
d9d75f7
resolve comment
mapleFU Jun 26, 2023
a1cae5b
[add] add tests
mapleFU Jun 26, 2023
e90f209
fix comment
mapleFU Jun 29, 2023
2db4f0b
Merge branch 'main' into parquet/tiny-optimize-statistics
mapleFU Jul 4, 2023
0f86ba0
Apply suggestions from code review
mapleFU Jul 4, 2023
5f3f546
Merge branch 'main' into parquet/tiny-optimize-statistics
mapleFU Jul 5, 2023
7bf6901
fix comment
mapleFU Jul 5, 2023
e4d8003
Apply suggestions from code review
mapleFU Jul 5, 2023
20ffa2b
Merge branch 'main' into parquet/tiny-optimize-statistics
mapleFU Jul 6, 2023
bad5ba2
fix comment
mapleFU Jul 6, 2023
a57ee4a
Mention issue number
pitrou Jul 6, 2023
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion c_glib/test/parquet/test-statistics.rb
Original file line number Diff line number Diff line change
Expand Up @@ -51,7 +51,7 @@ def setup

test("#has_n_distinct_values?") do
assert do
@statistics.has_n_distinct_values?
not @statistics.has_n_distinct_values?
end
end

Expand Down
59 changes: 43 additions & 16 deletions cpp/src/parquet/statistics.cc
Original file line number Diff line number Diff line change
Expand Up @@ -463,6 +463,7 @@ class TypedStatisticsImpl : public TypedStatistics<DType> {
public:
using T = typename DType::c_type;

// Create an empty stats.
TypedStatisticsImpl(const ColumnDescriptor* descr, MemoryPool* pool)
: descr_(descr),
pool_(pool),
Expand All @@ -471,35 +472,39 @@ class TypedStatisticsImpl : public TypedStatistics<DType> {
auto comp = Comparator::Make(descr);
comparator_ = std::static_pointer_cast<TypedComparator<DType>>(comp);
TypedStatisticsImpl::Reset();
has_null_count_ = true;
has_distinct_count_ = true;
}

// Create stats from provided values.
TypedStatisticsImpl(const T& min, const T& max, int64_t num_values, int64_t null_count,
int64_t distinct_count)
: pool_(default_memory_pool()),
min_buffer_(AllocateBuffer(pool_, 0)),
max_buffer_(AllocateBuffer(pool_, 0)) {
TypedStatisticsImpl::IncrementNumValues(num_values);
TypedStatisticsImpl::IncrementNullCount(null_count);
IncrementDistinctCount(distinct_count);
SetDistinctCount(distinct_count);

Copy(min, &min_, min_buffer_.get());
Copy(max, &max_, max_buffer_.get());
has_min_max_ = true;
}

// Create stats from a thrift Statistics object.
TypedStatisticsImpl(const ColumnDescriptor* descr, const std::string& encoded_min,
const std::string& encoded_max, int64_t num_values,
int64_t null_count, int64_t distinct_count, bool has_min_max,
bool has_null_count, bool has_distinct_count, MemoryPool* pool)
: TypedStatisticsImpl(descr, pool) {
TypedStatisticsImpl::IncrementNumValues(num_values);
if (has_null_count_) {
if (has_null_count) {
TypedStatisticsImpl::IncrementNullCount(null_count);
} else {
has_null_count_ = false;
}
if (has_distinct_count) {
IncrementDistinctCount(distinct_count);
SetDistinctCount(distinct_count);
} else {
has_distinct_count_ = false;
}

if (!encoded_min.empty()) {
Expand Down Expand Up @@ -541,9 +546,7 @@ class TypedStatisticsImpl : public TypedStatistics<DType> {

void Reset() override {
ResetCounts();
has_min_max_ = false;
has_distinct_count_ = false;
has_null_count_ = false;
ResetHasFlags();
}

void SetMinMax(const T& arg_min, const T& arg_max) override {
Expand All @@ -552,12 +555,18 @@ class TypedStatisticsImpl : public TypedStatistics<DType> {

void Merge(const TypedStatistics<DType>& other) override {
this->num_values_ += other.num_values();
// null_count is always valid when merging page statistics into
// column chunk statistics.
if (other.HasNullCount()) {
this->statistics_.null_count += other.null_count();
} else {
this->has_null_count_ = false;
}
if (other.HasDistinctCount()) {
this->statistics_.distinct_count += other.distinct_count();
}
// Clear has_distinct_count_ as distinct count cannot be merged.
has_distinct_count_ = false;
// Do not clear min/max here if the other side does not provide
// min/max which may happen when other is an empty stats or all
// its values are null and/or NaN.
if (other.HasMinMax()) {
SetMinMax(other.min(), other.max());
}
Expand Down Expand Up @@ -609,9 +618,10 @@ class TypedStatisticsImpl : public TypedStatistics<DType> {
}
if (HasNullCount()) {
s.set_null_count(this->null_count());
// num_values_ is reliable and it means number of non-null values.
s.all_null_value = num_values_ == 0;
mapleFU marked this conversation as resolved.
Show resolved Hide resolved
}
// num_values_ is reliable and it means number of non-null values.
s.all_null_value = num_values_ == 0;
// FIXME(mwish): distinct count is not encoded for now.
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can you open an issue for this? FIXMEs in the code will be forgotten...

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Created. But actually I found a lot of issues be forgotten #36505

pitrou marked this conversation as resolved.
Show resolved Hide resolved
return s;
}

Expand All @@ -627,7 +637,12 @@ class TypedStatisticsImpl : public TypedStatistics<DType> {
T min_;
T max_;
::arrow::MemoryPool* pool_;
int64_t num_values_ = 0; // # of non-null values.
// Number of non-null values.
// Please note that num_values_ is reliable when has_null_count_ is set.
// When has_null_count_ is not set, e.g. a page statistics created from
// a statistics thrift message which doesn't have the optional null_count,
// `num_values_` may include null values.
int64_t num_values_ = 0;
EncodedStatistics statistics_;
std::shared_ptr<TypedComparator<DType>> comparator_;
std::shared_ptr<ResizableBuffer> min_buffer_, max_buffer_;
Expand All @@ -637,8 +652,9 @@ class TypedStatisticsImpl : public TypedStatistics<DType> {

void Copy(const T& src, T* dst, ResizableBuffer*) { *dst = src; }

void IncrementDistinctCount(int64_t n) {
statistics_.distinct_count += n;
void SetDistinctCount(int64_t n) {
// distinct count can only be "set", and cannot be incremented.
statistics_.distinct_count = n;
has_distinct_count_ = true;
}

Expand All @@ -648,6 +664,17 @@ class TypedStatisticsImpl : public TypedStatistics<DType> {
this->num_values_ = 0;
}

void ResetHasFlags() {
// has_min_max_ will only be set when it meets any valid value.
this->has_min_max_ = false;
// has_distinct_count_ will only be set once SetDistinctCount()
// is called because distinct count calculation is not cheap and
// disabled by default.
this->has_distinct_count_ = false;
// Null count calculation is cheap and enabled by default.
this->has_null_count_ = true;
}

void SetMinMaxPair(std::pair<T, T> min_max) {
// CleanStatistic can return a nullopt in case of erroneous values, e.g. NaN
auto maybe_min_max = CleanStatistic(min_max);
Expand Down
27 changes: 14 additions & 13 deletions cpp/src/parquet/statistics.h
Original file line number Diff line number Diff line change
Expand Up @@ -117,17 +117,16 @@ std::shared_ptr<TypedComparator<DType>> MakeComparator(const ColumnDescriptor* d
// ----------------------------------------------------------------------

/// \brief Structure represented encoded statistics to be written to
/// and from Parquet serialized metadata
/// and read from Parquet serialized metadata.
class PARQUET_EXPORT EncodedStatistics {
std::shared_ptr<std::string> max_, min_;
std::string max_, min_;
bool is_signed_ = false;

public:
EncodedStatistics()
: max_(std::make_shared<std::string>()), min_(std::make_shared<std::string>()) {}
EncodedStatistics() = default;

const std::string& max() const { return *max_; }
const std::string& min() const { return *min_; }
const std::string& max() const { return max_; }
const std::string& min() const { return min_; }

int64_t null_count = 0;
int64_t distinct_count = 0;
Expand All @@ -149,11 +148,13 @@ class PARQUET_EXPORT EncodedStatistics {
// the true minimum for aggregations and there is no way to mark that a
// value has been truncated and is a lower bound and not in the page.
void ApplyStatSizeLimits(size_t length) {
if (max_->length() > length) {
if (max_.length() > length) {
mapleFU marked this conversation as resolved.
Show resolved Hide resolved
has_max = false;
max_.clear();
}
if (min_->length() > length) {
if (min_.length() > length) {
has_min = false;
min_.clear();
}
}

Expand All @@ -165,14 +166,14 @@ class PARQUET_EXPORT EncodedStatistics {

void set_is_signed(bool is_signed) { is_signed_ = is_signed; }

EncodedStatistics& set_max(const std::string& value) {
*max_ = value;
EncodedStatistics& set_max(std::string value) {
max_ = std::move(value);
has_max = true;
return *this;
}

EncodedStatistics& set_min(const std::string& value) {
*min_ = value;
EncodedStatistics& set_min(std::string value) {
min_ = std::move(value);
has_min = true;
return *this;
}
Expand Down Expand Up @@ -329,7 +330,7 @@ class TypedStatistics : public Statistics {
/// null count is determined from the indices)
virtual void IncrementNullCount(int64_t n) = 0;

/// \brief Increments the number ov values directly
/// \brief Increments the number of values directly
/// The same note on IncrementNullCount applies here
virtual void IncrementNumValues(int64_t n) = 0;
};
Expand Down
Loading
Loading