Skip to content

Commit

Permalink
GH-34351: [C++][Parquet] Statistics: add detail documentation and tin…
Browse files Browse the repository at this point in the history
…y optimization (#35989)

### Rationale for this change

### What changes are included in this PR?

1. This patch does some tiny optimizations on Parquet C++ Statistics. It does:

```
For min-max, using std::string. Because assume the case like that:
EncodedStatistics c1;
// do some operations
EncodedStatistics c2 = c1;
c2.set_max("dasdasdassd");
After c2 set, c1 would be set too. So I use std::string here.
```

2. Force clear ndv count during merging, and set `has_distinct_count_ = false`, and add some comments
3. Add some specification in Statistics API

### Are these changes tested?

Yes

### Are there any user-facing changes?

No

* Closes: #34351

Lead-authored-by: mwish <maplewish117@gmail.com>
Co-authored-by: mwish <1506118561@qq.com>
Co-authored-by: Antoine Pitrou <pitrou@free.fr>
Co-authored-by: Gang Wu <ustcwg@gmail.com>
Signed-off-by: Antoine Pitrou <antoine@python.org>
  • Loading branch information
4 people committed Jul 6, 2023
1 parent 88cb517 commit 60fdc25
Show file tree
Hide file tree
Showing 4 changed files with 249 additions and 30 deletions.
2 changes: 1 addition & 1 deletion c_glib/test/parquet/test-statistics.rb
Expand Up @@ -51,7 +51,7 @@ def setup

test("#has_n_distinct_values?") do
assert do
@statistics.has_n_distinct_values?
not @statistics.has_n_distinct_values?
end
end

Expand Down
59 changes: 43 additions & 16 deletions cpp/src/parquet/statistics.cc
Expand Up @@ -463,6 +463,7 @@ class TypedStatisticsImpl : public TypedStatistics<DType> {
public:
using T = typename DType::c_type;

// Create an empty stats.
TypedStatisticsImpl(const ColumnDescriptor* descr, MemoryPool* pool)
: descr_(descr),
pool_(pool),
Expand All @@ -471,35 +472,39 @@ class TypedStatisticsImpl : public TypedStatistics<DType> {
auto comp = Comparator::Make(descr);
comparator_ = std::static_pointer_cast<TypedComparator<DType>>(comp);
TypedStatisticsImpl::Reset();
has_null_count_ = true;
has_distinct_count_ = true;
}

// Create stats from provided values.
TypedStatisticsImpl(const T& min, const T& max, int64_t num_values, int64_t null_count,
int64_t distinct_count)
: pool_(default_memory_pool()),
min_buffer_(AllocateBuffer(pool_, 0)),
max_buffer_(AllocateBuffer(pool_, 0)) {
TypedStatisticsImpl::IncrementNumValues(num_values);
TypedStatisticsImpl::IncrementNullCount(null_count);
IncrementDistinctCount(distinct_count);
SetDistinctCount(distinct_count);

Copy(min, &min_, min_buffer_.get());
Copy(max, &max_, max_buffer_.get());
has_min_max_ = true;
}

// Create stats from a thrift Statistics object.
TypedStatisticsImpl(const ColumnDescriptor* descr, const std::string& encoded_min,
const std::string& encoded_max, int64_t num_values,
int64_t null_count, int64_t distinct_count, bool has_min_max,
bool has_null_count, bool has_distinct_count, MemoryPool* pool)
: TypedStatisticsImpl(descr, pool) {
TypedStatisticsImpl::IncrementNumValues(num_values);
if (has_null_count_) {
if (has_null_count) {
TypedStatisticsImpl::IncrementNullCount(null_count);
} else {
has_null_count_ = false;
}
if (has_distinct_count) {
IncrementDistinctCount(distinct_count);
SetDistinctCount(distinct_count);
} else {
has_distinct_count_ = false;
}

if (!encoded_min.empty()) {
Expand Down Expand Up @@ -541,9 +546,7 @@ class TypedStatisticsImpl : public TypedStatistics<DType> {

void Reset() override {
ResetCounts();
has_min_max_ = false;
has_distinct_count_ = false;
has_null_count_ = false;
ResetHasFlags();
}

void SetMinMax(const T& arg_min, const T& arg_max) override {
Expand All @@ -552,12 +555,18 @@ class TypedStatisticsImpl : public TypedStatistics<DType> {

void Merge(const TypedStatistics<DType>& other) override {
this->num_values_ += other.num_values();
// null_count is always valid when merging page statistics into
// column chunk statistics.
if (other.HasNullCount()) {
this->statistics_.null_count += other.null_count();
} else {
this->has_null_count_ = false;
}
if (other.HasDistinctCount()) {
this->statistics_.distinct_count += other.distinct_count();
}
// Clear has_distinct_count_ as distinct count cannot be merged.
has_distinct_count_ = false;
// Do not clear min/max here if the other side does not provide
// min/max which may happen when other is an empty stats or all
// its values are null and/or NaN.
if (other.HasMinMax()) {
SetMinMax(other.min(), other.max());
}
Expand Down Expand Up @@ -609,9 +618,10 @@ class TypedStatisticsImpl : public TypedStatistics<DType> {
}
if (HasNullCount()) {
s.set_null_count(this->null_count());
// num_values_ is reliable and it means number of non-null values.
s.all_null_value = num_values_ == 0;
}
// num_values_ is reliable and it means number of non-null values.
s.all_null_value = num_values_ == 0;
// TODO (GH-36505): distinct count is not encoded for now.
return s;
}

Expand All @@ -627,7 +637,12 @@ class TypedStatisticsImpl : public TypedStatistics<DType> {
T min_;
T max_;
::arrow::MemoryPool* pool_;
int64_t num_values_ = 0; // # of non-null values.
// Number of non-null values.
// Please note that num_values_ is reliable when has_null_count_ is set.
// When has_null_count_ is not set, e.g. a page statistics created from
// a statistics thrift message which doesn't have the optional null_count,
// `num_values_` may include null values.
int64_t num_values_ = 0;
EncodedStatistics statistics_;
std::shared_ptr<TypedComparator<DType>> comparator_;
std::shared_ptr<ResizableBuffer> min_buffer_, max_buffer_;
Expand All @@ -637,8 +652,9 @@ class TypedStatisticsImpl : public TypedStatistics<DType> {

void Copy(const T& src, T* dst, ResizableBuffer*) { *dst = src; }

void IncrementDistinctCount(int64_t n) {
statistics_.distinct_count += n;
void SetDistinctCount(int64_t n) {
// distinct count can only be "set", and cannot be incremented.
statistics_.distinct_count = n;
has_distinct_count_ = true;
}

Expand All @@ -648,6 +664,17 @@ class TypedStatisticsImpl : public TypedStatistics<DType> {
this->num_values_ = 0;
}

void ResetHasFlags() {
// has_min_max_ will only be set when it meets any valid value.
this->has_min_max_ = false;
// has_distinct_count_ will only be set once SetDistinctCount()
// is called because distinct count calculation is not cheap and
// disabled by default.
this->has_distinct_count_ = false;
// Null count calculation is cheap and enabled by default.
this->has_null_count_ = true;
}

void SetMinMaxPair(std::pair<T, T> min_max) {
// CleanStatistic can return a nullopt in case of erroneous values, e.g. NaN
auto maybe_min_max = CleanStatistic(min_max);
Expand Down
27 changes: 14 additions & 13 deletions cpp/src/parquet/statistics.h
Expand Up @@ -117,17 +117,16 @@ std::shared_ptr<TypedComparator<DType>> MakeComparator(const ColumnDescriptor* d
// ----------------------------------------------------------------------

/// \brief Structure represented encoded statistics to be written to
/// and from Parquet serialized metadata
/// and read from Parquet serialized metadata.
class PARQUET_EXPORT EncodedStatistics {
std::shared_ptr<std::string> max_, min_;
std::string max_, min_;
bool is_signed_ = false;

public:
EncodedStatistics()
: max_(std::make_shared<std::string>()), min_(std::make_shared<std::string>()) {}
EncodedStatistics() = default;

const std::string& max() const { return *max_; }
const std::string& min() const { return *min_; }
const std::string& max() const { return max_; }
const std::string& min() const { return min_; }

int64_t null_count = 0;
int64_t distinct_count = 0;
Expand All @@ -149,11 +148,13 @@ class PARQUET_EXPORT EncodedStatistics {
// the true minimum for aggregations and there is no way to mark that a
// value has been truncated and is a lower bound and not in the page.
void ApplyStatSizeLimits(size_t length) {
if (max_->length() > length) {
if (max_.length() > length) {
has_max = false;
max_.clear();
}
if (min_->length() > length) {
if (min_.length() > length) {
has_min = false;
min_.clear();
}
}

Expand All @@ -165,14 +166,14 @@ class PARQUET_EXPORT EncodedStatistics {

void set_is_signed(bool is_signed) { is_signed_ = is_signed; }

EncodedStatistics& set_max(const std::string& value) {
*max_ = value;
EncodedStatistics& set_max(std::string value) {
max_ = std::move(value);
has_max = true;
return *this;
}

EncodedStatistics& set_min(const std::string& value) {
*min_ = value;
EncodedStatistics& set_min(std::string value) {
min_ = std::move(value);
has_min = true;
return *this;
}
Expand Down Expand Up @@ -329,7 +330,7 @@ class TypedStatistics : public Statistics {
/// null count is determined from the indices)
virtual void IncrementNullCount(int64_t n) = 0;

/// \brief Increments the number ov values directly
/// \brief Increments the number of values directly
/// The same note on IncrementNullCount applies here
virtual void IncrementNumValues(int64_t n) = 0;
};
Expand Down

0 comments on commit 60fdc25

Please sign in to comment.