Address more review points

apache · Jun 21, 2023 · 22d54c2 · 22d54c2
1 parent af52e6e
commit 22d54c2
Show file tree

Hide file tree

Showing 5 changed files with 128 additions and 68 deletions.
diff --git a/cpp/src/arrow/CMakeLists.txt b/cpp/src/arrow/CMakeLists.txt
@@ -214,6 +214,7 @@ set(ARROW_SRCS
     util/debug.cc
     util/decimal.cc
     util/delimiting.cc
+    util/float16.cc
     util/formatting.cc
     util/future.cc
     util/hashing.cc

diff --git a/cpp/src/arrow/util/float16.cc b/cpp/src/arrow/util/float16.cc
@@ -0,0 +1,28 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include <ostream>
+
+#include "arrow/util/float16.h"
+
+namespace arrow {
+namespace util {
+
+std::ostream& operator<<(std::ostream& os, Float16Base arg) { return (os << arg.bits()); }
+
+}  // namespace util
+}  // namespace arrow
diff --git a/cpp/src/arrow/util/float16.h b/cpp/src/arrow/util/float16.h
@@ -18,13 +18,13 @@
 #pragma once
 
 #include <array>
-#include <cmath>
 #include <cstdint>
 #include <cstring>
+#include <iosfwd>
 #include <limits>
 #include <type_traits>
 
-#include "arrow/util/bit_util.h"
+#include "arrow/util/endian.h"
 #include "arrow/util/ubsan.h"
 #include "arrow/util/visibility.h"
 
@@ -45,38 +45,57 @@ class Float16Base {
   Float16Base() = default;
   constexpr explicit Float16Base(uint16_t value) : value_(value) {}
 
+  /// \brief Return the value's integer representation
   constexpr uint16_t bits() const { return value_; }
   constexpr explicit operator uint16_t() const { return bits(); }
 
+  /// \brief Return true if the value is negative (sign bit is set)
   constexpr bool signbit() const { return (value_ & 0x8000) != 0; }
 
+  /// \brief Return true if the value is NaN
   constexpr bool is_nan() const {
     return (value_ & 0x7c00) == 0x7c00 && (value_ & 0x03ff) != 0;
   }
+  /// \brief Return true if the value is positive/negative infinity
   constexpr bool is_infinity() const { return (value_ & 0x7fff) == 0x7c00; }
+  /// \brief Return true if the value is positive/negative zero
   constexpr bool is_zero() const { return (value_ & 0x7fff) == 0; }
 
   /// \brief Copy the value's bytes in native-endian byte order
   void ToBytes(uint8_t* dest) const { std::memcpy(dest, &value_, sizeof(value_)); }
   /// \brief Return the value's bytes in native-endian byte order
-  std::array<uint8_t, 2> ToBytes() const {
-    std::array<uint8_t, 2> bytes;
-    ToBytes(bytes.data());
-    return bytes;
+  constexpr std::array<uint8_t, 2> ToBytes() const {
+#if ARROW_LITTLE_ENDIAN
+    return ToLittleEndian();
+#else
+    return ToBigEndian();
+#endif
   }
 
+  /// \brief Copy the value's bytes in little-endian byte order
   void ToLittleEndian(uint8_t* dest) const {
     Float16Base{bit_util::ToLittleEndian(value_)}.ToBytes(dest);
   }
-  std::array<uint8_t, 2> ToLittleEndian() const {
-    return Float16Base{bit_util::ToLittleEndian(value_)}.ToBytes();
+  /// \brief Return the value's bytes in little-endian byte order
+  constexpr std::array<uint8_t, 2> ToLittleEndian() const {
+#if ARROW_LITTLE_ENDIAN
+    return {uint8_t(value_ & 0xff), uint8_t(value_ >> 8)};
+#else
+    return {uint8_t(value_ >> 8), uint8_t(value_ & 0xff)};
+#endif
   }
 
+  /// \brief Copy the value's bytes in big-endian byte order
   void ToBigEndian(uint8_t* dest) const {
     Float16Base{bit_util::ToBigEndian(value_)}.ToBytes(dest);
   }
-  std::array<uint8_t, 2> ToBigEndian() const {
-    return Float16Base{bit_util::ToBigEndian(value_)}.ToBytes();
+  /// \brief Return the value's bytes in big-endian byte order
+  constexpr std::array<uint8_t, 2> ToBigEndian() const {
+#if ARROW_LITTLE_ENDIAN
+    return {uint8_t(value_ >> 8), uint8_t(value_ & 0xff)};
+#else
+    return {uint8_t(value_ & 0xff), uint8_t(value_ >> 8)};
+#endif
   }
 
   friend constexpr bool operator==(Float16Base lhs, Float16Base rhs) {
@@ -98,13 +117,10 @@ class Float16Base {
     return !Float16Base::CompareLt(rhs, lhs);
   }
   friend constexpr bool operator>=(Float16Base lhs, Float16Base rhs) {
-    if (lhs.is_nan() || rhs.is_nan()) return false;
-    return !Float16Base::CompareLt(lhs, rhs);
+    return rhs <= lhs;
   }
 
-  friend std::ostream& operator<<(std::ostream& os, Float16Base arg) {
-    return (os << arg.bits());
-  }
+  ARROW_FRIEND_EXPORT friend std::ostream& operator<<(std::ostream& os, Float16Base arg);
 
  protected:
   uint16_t value_;
@@ -118,7 +134,7 @@ class Float16Base {
     if (lhs.signbit()) {
       if (rhs.signbit()) {
         // Both are negative
-        return (lhs.bits() & 0x7fff) > (rhs.bits() & 0x7fff);
+        return lhs.bits() > rhs.bits();
       } else {
         // Handle +/-0
         return !lhs.is_zero() || rhs.bits() != 0;
@@ -127,7 +143,7 @@ class Float16Base {
       return false;
     } else {
       // Both are positive
-      return (lhs.bits() & 0x7fff) < (rhs.bits() & 0x7fff);
+      return lhs.bits() < rhs.bits();
     }
   }
 };
@@ -145,9 +161,12 @@ class Float16 : public Float16Base {
     return Float16(SafeLoadAs<uint16_t>(src));
   }
 
+  /// \brief Read a `Float16` from memory in little-endian byte order
   static Float16 FromLittleEndian(const uint8_t* src) {
     return Float16(bit_util::FromLittleEndian(SafeLoadAs<uint16_t>(src)));
   }
+
+  /// \brief Read a `Float16` from memory in big-endian byte order
   static Float16 FromBigEndian(const uint8_t* src) {
     return Float16(bit_util::FromBigEndian(SafeLoadAs<uint16_t>(src)));
   }

diff --git a/cpp/src/arrow/util/float16_test.cc b/cpp/src/arrow/util/float16_test.cc
@@ -98,10 +98,8 @@ class Float16OperatorTest : public ::testing::Test {
     const auto num_values = static_cast<int>(test_values.size());
 
     // Check all combinations of operands in both directions
-    for (int offset = 0; offset < num_values; ++offset) {
-      int i = 0;
-      int j = offset;
-      while (i < num_values) {
+    for (int i = 0; i < num_values; ++i) {
+      for (int j = 0; j < num_values; ++j) {
         ARROW_SCOPED_TRACE(i, ",", j);
 
         auto a = test_values[i];
@@ -110,9 +108,6 @@ class Float16OperatorTest : public ::testing::Test {
         // Results for float16 and float32 should be the same
         auto ret = Operator{}(a, b);
         ASSERT_EQ(ret.first, ret.second);
-
-        ++i;
-        j = (j + 1) % num_values;
       }
     }
   }
@@ -127,19 +122,32 @@ TYPED_TEST(Float16OperatorTest, Compare) { this->TestCompare(g_test_values); }
 
 TEST(Float16Test, ToBytes) {
   constexpr auto f16 = Float16(0xd01c);
-  auto bytes = f16.ToBytes();
-  ASSERT_EQ(SafeLoadAs<uint16_t>(bytes.data()), 0xd01c);
+  std::array<uint8_t, 2> bytes;
+  auto load = [&bytes]() { return SafeLoadAs<uint16_t>(bytes.data()); };
+
+  // Test native-endian
+  f16.ToBytes(bytes.data());
+  ASSERT_EQ(load(), 0xd01c);
+  bytes = f16.ToBytes();
+  ASSERT_EQ(load(), 0xd01c);
+
 #if ARROW_LITTLE_ENDIAN
-  bytes = f16.ToLittleEndian();
-  ASSERT_EQ(SafeLoadAs<uint16_t>(bytes.data()), 0xd01c);
-  bytes = f16.ToBigEndian();
-  ASSERT_EQ(SafeLoadAs<uint16_t>(bytes.data()), 0x1cd0);
+  constexpr uint16_t expected_le = 0xd01c;
+  constexpr uint16_t expected_be = 0x1cd0;
 #else
+  constexpr uint16_t expected_le = 0x1cd0;
+  constexpr uint16_t expected_be = 0xd01c;
+#endif
+  // Test little-endian
+  f16.ToLittleEndian(bytes.data());
+  ASSERT_EQ(load(), expected_le);
   bytes = f16.ToLittleEndian();
-  ASSERT_EQ(SafeLoadAs<uint16_t>(bytes.data()), 0x1cd0);
+  ASSERT_EQ(load(), expected_le);
+  // Test big-endian
+  f16.ToBigEndian(bytes.data());
+  ASSERT_EQ(load(), expected_be);
   bytes = f16.ToBigEndian();
-  ASSERT_EQ(SafeLoadAs<uint16_t>(bytes.data()), 0xd01c);
-#endif
+  ASSERT_EQ(load(), expected_be);
 }
 
 TEST(Float16Test, FromBytes) {

diff --git a/cpp/src/parquet/statistics.cc b/cpp/src/parquet/statistics.cc
@@ -57,22 +57,20 @@ constexpr int value_length(int type_length, const FLBA& value) { return type_len
 
 // Static "constants" for normalizing float16 min/max values. These need to be expressed
 // as pointers because `Float16LogicalType` represents an FLBA.
-const uint8_t* float16_lowest() {
-  static const auto bytes = std::numeric_limits<Float16>::lowest().ToLittleEndian();
-  return bytes.data();
-}
-const uint8_t* float16_max() {
-  static const auto bytes = std::numeric_limits<Float16>::max().ToLittleEndian();
-  return bytes.data();
-}
-const uint8_t* float16_positive_zero() {
-  static const auto bytes = Float16(0).ToLittleEndian();
-  return bytes.data();
-}
-const uint8_t* float16_negative_zero() {
-  static const auto bytes = (-Float16(0)).ToLittleEndian();
-  return bytes.data();
-}
+struct Float16Constants {
+  static constexpr const uint8_t* lowest() { return lowest_.data(); }
+  static constexpr const uint8_t* max() { return max_.data(); }
+  static constexpr const uint8_t* positive_zero() { return positive_zero_.data(); }
+  static constexpr const uint8_t* negative_zero() { return negative_zero_.data(); }
+
+ private:
+  using Bytes = std::array<uint8_t, 2>;
+  static constexpr Bytes lowest_ =
+      std::numeric_limits<Float16>::lowest().ToLittleEndian();
+  static constexpr Bytes max_ = std::numeric_limits<Float16>::max().ToLittleEndian();
+  static constexpr Bytes positive_zero_ = (+Float16(0)).ToLittleEndian();
+  static constexpr Bytes negative_zero_ = (-Float16(0)).ToLittleEndian();
+};
 
 template <typename DType, bool is_signed>
 struct CompareHelper {
@@ -301,12 +299,12 @@ struct CompareHelper<FLBAType, is_signed>
 struct Float16CompareHelper {
   using T = FLBA;
 
-  static T DefaultMin() { return T{float16_max()}; }
-  static T DefaultMax() { return T{float16_lowest()}; }
+  static T DefaultMin() { return T{Float16Constants::max()}; }
+  static T DefaultMax() { return T{Float16Constants::lowest()}; }
 
   static T Coalesce(T val, T fallback) {
-    return val.ptr != nullptr && Float16::FromLittleEndian(val.ptr).is_nan() ? fallback
-                                                                             : val;
+    return (val.ptr == nullptr || Float16::FromLittleEndian(val.ptr).is_nan()) ? fallback
+                                                                               : val;
   }
 
   static inline bool Compare(int type_length, const T& a, const T& b) {
@@ -386,10 +384,10 @@ optional<std::pair<FLBA, FLBA>> CleanFloat16Statistic(std::pair<FLBA, FLBA> min_
   }
 
   if (min == Float16(0)) {
-    min_flba = FLBA{float16_negative_zero()};
+    min_flba = FLBA{Float16Constants::negative_zero()};
   }
   if (max == -Float16(0)) {
-    max_flba = FLBA{float16_positive_zero()};
+    max_flba = FLBA{Float16Constants::positive_zero()};
   }
 
   return {{min_flba, max_flba}};
@@ -540,13 +538,13 @@ std::pair<ByteArray, ByteArray> TypedComparatorImpl<false, ByteArrayType>::GetMi
   return GetMinMaxBinaryHelper<false>(*this, values);
 }
 
-static LogicalType::Type::type LogicalTypeId(const ColumnDescriptor* descr) {
+LogicalType::Type::type LogicalTypeId(const ColumnDescriptor* descr) {
   if (const auto& logical_type = descr->logical_type()) {
     return logical_type->type();
   }
   return LogicalType::Type::NONE;
 }
-static LogicalType::Type::type LogicalTypeId(const Statistics& stats) {
+LogicalType::Type::type LogicalTypeId(const Statistics& stats) {
   return LogicalTypeId(stats.descr());
 }
 
@@ -613,20 +611,26 @@ class TypedStatisticsImpl : public TypedStatistics<DType> {
 
   void IncrementNumValues(int64_t n) override { num_values_ += n; }
 
+  static bool IsMeaningfulLogicalType(LogicalType::Type::type type) {
+    switch (type) {
+      case LogicalType::Type::FLOAT16:
+        return true;
+      default:
+        return false;
+    }
+  }
+
   bool Equals(const Statistics& raw_other) const override {
     if (physical_type() != raw_other.physical_type()) return false;
 
     const auto logical_id = LogicalTypeId(*this);
-    switch (logical_id) {
-      // Only compare against logical types that influence the interpretation of the
-      // physical type
-      case LogicalType::Type::FLOAT16:
-        if (LogicalTypeId(raw_other) != logical_id) {
-          return false;
-        }
-        break;
-      default:
-        break;
+    const auto other_logical_id = LogicalTypeId(raw_other);
+    // Only compare against logical types that influence the interpretation of the
+    // physical type
+    if (IsMeaningfulLogicalType(logical_id)) {
+      if (logical_id != other_logical_id) return false;
+    } else if (IsMeaningfulLogicalType(other_logical_id)) {
+      return false;
     }
 
     const auto& other = checked_cast<const TypedStatisticsImpl&>(raw_other);
@@ -886,7 +890,7 @@ std::shared_ptr<Comparator> DoMakeComparator(Type::type physical_type,
       case Type::FIXED_LEN_BYTE_ARRAY:
         if (logical_type == LogicalType::Type::FLOAT16) {
           return std::make_shared<
-              TypedComparatorImpl<true, FLBAType, Float16CompareHelper>>();
+              TypedComparatorImpl<true, FLBAType, Float16CompareHelper>>(type_length);
         }
         return std::make_shared<TypedComparatorImpl<true, FLBAType>>(type_length);
       default: