Skip to content
This repository has been archived by the owner on May 10, 2024. It is now read-only.

Commit

Permalink
PARQUET-1357: FormatStatValue truncates binary statistics on zero cha…
Browse files Browse the repository at this point in the history
…racter

Author: Korn, Uwe <Uwe.Korn@blue-yonder.com>

Closes #479 from xhochy/PARQUET-1357 and squashes the following commits:

4135976 [Korn, Uwe] Add macro for deprecations
e0e4946 [Korn, Uwe] Update to clang-format 6.0
0073b56 [Korn, Uwe] PARQUET-1357: FormatStatValue truncates binary statistics on zero character
  • Loading branch information
xhochy committed Aug 1, 2018
1 parent 4906248 commit 646e225
Show file tree
Hide file tree
Showing 6 changed files with 108 additions and 10 deletions.
26 changes: 21 additions & 5 deletions cmake_modules/FindClangTools.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -30,14 +30,20 @@
# CLANG_FORMAT_BIN, The path to the clang format binary
# CLANG_TIDY_FOUND, Whether clang format was found

if (DEFINED ENV{HOMEBREW_PREFIX})
set(HOMEBREW_PREFIX "${ENV{HOMEBREW_PREFIX}")
else()
set(HOMEBREW_PREFIX "/usr/local")
endif()

find_program(CLANG_TIDY_BIN
NAMES clang-tidy-4.0
clang-tidy-3.9
clang-tidy-3.8
clang-tidy-3.7
clang-tidy-3.6
clang-tidy
PATHS ${ClangTools_PATH} $ENV{CLANG_TOOLS_PATH} /usr/local/bin /usr/bin
PATHS ${ClangTools_PATH} $ENV{CLANG_TOOLS_PATH} /usr/local/bin /usr/bin "${HOMEBREW_PREFIX}/bin"
NO_DEFAULT_PATH
)

Expand All @@ -55,7 +61,7 @@ if (CLANG_FORMAT_VERSION)
PATHS
${ClangTools_PATH}
$ENV{CLANG_TOOLS_PATH}
/usr/local/bin /usr/bin
/usr/local/bin /usr/bin "${HOMEBREW_PREFIX}/bin"
NO_DEFAULT_PATH
)

Expand All @@ -67,16 +73,26 @@ if (CLANG_FORMAT_VERSION)
if ("${CLANG_FORMAT_MINOR_VERSION}" STREQUAL "0")
find_program(CLANG_FORMAT_BIN
NAMES clang-format
PATHS /usr/local/opt/llvm@${CLANG_FORMAT_MAJOR_VERSION}/bin
PATHS "${HOMEBREW_PREFIX}/opt/llvm@${CLANG_FORMAT_MAJOR_VERSION}/bin"
NO_DEFAULT_PATH
)
else()
find_program(CLANG_FORMAT_BIN
NAMES clang-format
PATHS /usr/local/opt/llvm@${CLANG_FORMAT_VERSION}/bin
PATHS "${HOMEBREW_PREFIX}/opt/llvm@${CLANG_FORMAT_VERSION}/bin"
NO_DEFAULT_PATH
)
endif()

if ("${CLANG_FORMAT_BIN}" STREQUAL "CLANG_FORMAT_BIN-NOTFOUND")
# binary was still not found, look into Cellar
file(GLOB CLANG_FORMAT_PATH "${HOMEBREW_PREFIX}/Cellar/llvm/${CLANG_FORMAT_VERSION}.*")
find_program(CLANG_FORMAT_BIN
NAMES clang-format
PATHS "${CLANG_FORMAT_PATH}/bin"
NO_DEFAULT_PATH
)
endif()
endif()
else()
find_program(CLANG_FORMAT_BIN
Expand All @@ -86,7 +102,7 @@ else()
clang-format-3.7
clang-format-3.6
clang-format
PATHS ${ClangTools_PATH} $ENV{CLANG_TOOLS_PATH} /usr/local/bin /usr/bin
PATHS ${ClangTools_PATH} $ENV{CLANG_TOOLS_PATH} /usr/local/bin /usr/bin "${HOMEBREW_PREFIX}/bin"
NO_DEFAULT_PATH
)
endif()
Expand Down
9 changes: 4 additions & 5 deletions src/parquet/printer.cc
Original file line number Diff line number Diff line change
Expand Up @@ -84,8 +84,8 @@ void ParquetFilePrinter::DebugPrint(std::ostream& stream, std::list<int> selecte
std::string min = stats->EncodeMin(), max = stats->EncodeMax();
stream << ", Null Values: " << stats->null_count()
<< ", Distinct Values: " << stats->distinct_count() << std::endl
<< " Max: " << FormatStatValue(descr->physical_type(), max.c_str())
<< ", Min: " << FormatStatValue(descr->physical_type(), min.c_str());
<< " Max: " << FormatStatValue(descr->physical_type(), max)
<< ", Min: " << FormatStatValue(descr->physical_type(), min);
} else {
stream << " Statistics Not Set";
}
Expand Down Expand Up @@ -207,9 +207,8 @@ void ParquetFilePrinter::JSONPrint(std::ostream& stream, std::list<int> selected
std::string min = stats->EncodeMin(), max = stats->EncodeMax();
stream << "\"NumNulls\": \"" << stats->null_count() << "\", "
<< "\"DistinctValues\": \"" << stats->distinct_count() << "\", "
<< "\"Max\": \"" << FormatStatValue(descr->physical_type(), max.c_str())
<< "\", "
<< "\"Min\": \"" << FormatStatValue(descr->physical_type(), min.c_str())
<< "\"Max\": \"" << FormatStatValue(descr->physical_type(), max) << "\", "
<< "\"Min\": \"" << FormatStatValue(descr->physical_type(), min)
<< "\" },";
} else {
stream << "\"False\",";
Expand Down
27 changes: 27 additions & 0 deletions src/parquet/types-test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -62,54 +62,81 @@ TEST(TestLogicalTypeToString, LogicalTypes) {
}

TEST(TypePrinter, StatisticsTypes) {
#if !(defined(_WIN32) || defined(__CYGWIN__))
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wdeprecated-declarations"
#endif
std::string smin;
std::string smax;
int32_t int_min = 1024;
int32_t int_max = 2048;
smin = std::string(reinterpret_cast<char*>(&int_min), sizeof(int32_t));
smax = std::string(reinterpret_cast<char*>(&int_max), sizeof(int32_t));
ASSERT_STREQ("1024", FormatStatValue(Type::INT32, smin).c_str());
ASSERT_STREQ("1024", FormatStatValue(Type::INT32, smin.c_str()).c_str());
ASSERT_STREQ("2048", FormatStatValue(Type::INT32, smax).c_str());
ASSERT_STREQ("2048", FormatStatValue(Type::INT32, smax.c_str()).c_str());

int64_t int64_min = 10240000000000;
int64_t int64_max = 20480000000000;
smin = std::string(reinterpret_cast<char*>(&int64_min), sizeof(int64_t));
smax = std::string(reinterpret_cast<char*>(&int64_max), sizeof(int64_t));
ASSERT_STREQ("10240000000000", FormatStatValue(Type::INT64, smin).c_str());
ASSERT_STREQ("10240000000000", FormatStatValue(Type::INT64, smin.c_str()).c_str());
ASSERT_STREQ("20480000000000", FormatStatValue(Type::INT64, smax).c_str());
ASSERT_STREQ("20480000000000", FormatStatValue(Type::INT64, smax.c_str()).c_str());

float float_min = 1.024f;
float float_max = 2.048f;
smin = std::string(reinterpret_cast<char*>(&float_min), sizeof(float));
smax = std::string(reinterpret_cast<char*>(&float_max), sizeof(float));
ASSERT_STREQ("1.024", FormatStatValue(Type::FLOAT, smin).c_str());
ASSERT_STREQ("1.024", FormatStatValue(Type::FLOAT, smin.c_str()).c_str());
ASSERT_STREQ("2.048", FormatStatValue(Type::FLOAT, smax).c_str());
ASSERT_STREQ("2.048", FormatStatValue(Type::FLOAT, smax.c_str()).c_str());

double double_min = 1.0245;
double double_max = 2.0489;
smin = std::string(reinterpret_cast<char*>(&double_min), sizeof(double));
smax = std::string(reinterpret_cast<char*>(&double_max), sizeof(double));
ASSERT_STREQ("1.0245", FormatStatValue(Type::DOUBLE, smin).c_str());
ASSERT_STREQ("1.0245", FormatStatValue(Type::DOUBLE, smin.c_str()).c_str());
ASSERT_STREQ("2.0489", FormatStatValue(Type::DOUBLE, smax).c_str());
ASSERT_STREQ("2.0489", FormatStatValue(Type::DOUBLE, smax.c_str()).c_str());

Int96 Int96_min = {{1024, 2048, 4096}};
Int96 Int96_max = {{2048, 4096, 8192}};
smin = std::string(reinterpret_cast<char*>(&Int96_min), sizeof(Int96));
smax = std::string(reinterpret_cast<char*>(&Int96_max), sizeof(Int96));
ASSERT_STREQ("1024 2048 4096", FormatStatValue(Type::INT96, smin).c_str());
ASSERT_STREQ("1024 2048 4096", FormatStatValue(Type::INT96, smin.c_str()).c_str());
ASSERT_STREQ("2048 4096 8192", FormatStatValue(Type::INT96, smax).c_str());
ASSERT_STREQ("2048 4096 8192", FormatStatValue(Type::INT96, smax.c_str()).c_str());

smin = std::string("abcdef");
smax = std::string("ijklmnop");
ASSERT_STREQ("abcdef", FormatStatValue(Type::BYTE_ARRAY, smin).c_str());
ASSERT_STREQ("abcdef", FormatStatValue(Type::BYTE_ARRAY, smin.c_str()).c_str());
ASSERT_STREQ("ijklmnop", FormatStatValue(Type::BYTE_ARRAY, smax).c_str());
ASSERT_STREQ("ijklmnop", FormatStatValue(Type::BYTE_ARRAY, smax.c_str()).c_str());

// PARQUET-1357: FormatStatValue truncates binary statistics on zero character
smax.push_back('\0');
ASSERT_EQ(smax, FormatStatValue(Type::BYTE_ARRAY, smax));
// This fails, thus the call to FormatStatValue(.., const char*) was deprecated.
// ASSERT_EQ(smax, FormatStatValue(Type::BYTE_ARRAY, smax.c_str()));

smin = std::string("abcdefgh");
smax = std::string("ijklmnop");
ASSERT_STREQ("abcdefgh", FormatStatValue(Type::FIXED_LEN_BYTE_ARRAY, smin).c_str());
ASSERT_STREQ("abcdefgh",
FormatStatValue(Type::FIXED_LEN_BYTE_ARRAY, smin.c_str()).c_str());
ASSERT_STREQ("ijklmnop", FormatStatValue(Type::FIXED_LEN_BYTE_ARRAY, smax).c_str());
ASSERT_STREQ("ijklmnop",
FormatStatValue(Type::FIXED_LEN_BYTE_ARRAY, smax.c_str()).c_str());
#if !(defined(_WIN32) || defined(__CYGWIN__))
#pragma GCC diagnostic pop
#endif
}

} // namespace parquet
35 changes: 35 additions & 0 deletions src/parquet/types.cc
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,41 @@

namespace parquet {

std::string FormatStatValue(Type::type parquet_type, const std::string& val) {
std::stringstream result;
switch (parquet_type) {
case Type::BOOLEAN:
result << reinterpret_cast<const bool*>(val.c_str())[0];
break;
case Type::INT32:
result << reinterpret_cast<const int32_t*>(val.c_str())[0];
break;
case Type::INT64:
result << reinterpret_cast<const int64_t*>(val.c_str())[0];
break;
case Type::DOUBLE:
result << reinterpret_cast<const double*>(val.c_str())[0];
break;
case Type::FLOAT:
result << reinterpret_cast<const float*>(val.c_str())[0];
break;
case Type::INT96: {
auto const i32_val = reinterpret_cast<const int32_t*>(val.c_str());
result << i32_val[0] << " " << i32_val[1] << " " << i32_val[2];
break;
}
case Type::BYTE_ARRAY: {
return val;
}
case Type::FIXED_LEN_BYTE_ARRAY: {
return val;
}
default:
break;
}
return result.str();
}

std::string FormatStatValue(Type::type parquet_type, const char* val) {
std::stringstream result;
switch (parquet_type) {
Expand Down
6 changes: 6 additions & 0 deletions src/parquet/types.h
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@

#include "arrow/util/macros.h"

#include "parquet/util/macros.h"
#include "parquet/util/visibility.h"

namespace parquet {
Expand Down Expand Up @@ -292,6 +293,11 @@ PARQUET_EXPORT std::string LogicalTypeToString(LogicalType::type t);

PARQUET_EXPORT std::string TypeToString(Type::type t);

PARQUET_EXPORT std::string FormatStatValue(Type::type parquet_type,
const std::string& val);

/// \deprecated Since 1.5.0
PARQUET_DEPRECATED("Use std::string instead of char* as input")
PARQUET_EXPORT std::string FormatStatValue(Type::type parquet_type, const char* val);

PARQUET_EXPORT int GetTypeByteSize(Type::type t);
Expand Down
15 changes: 15 additions & 0 deletions src/parquet/util/macros.h
Original file line number Diff line number Diff line change
Expand Up @@ -68,4 +68,19 @@
#define FRIEND_TEST(test_case_name, test_name) \
friend class test_case_name##_##test_name##_Test

// clang-format off
// [[deprecated]] is only available in C++14, use this for the time being
// This macro takes an optional deprecation message
#if __cplusplus <= 201103L
# ifdef __GNUC__
# define PARQUET_DEPRECATED(...) __attribute__((deprecated(__VA_ARGS__)))
# elif defined(_MSC_VER)
# define PARQUET_DEPRECATED(...) __declspec(deprecated(__VA_ARGS__))
# else
# define PARQUET_DEPRECATED(...)
# endif
#else
# define PARQUET_DEPRECATED(...) [[deprecated(__VA_ARGS__)]]
#endif

#endif // PARQUET_UTIL_MACROS_H

0 comments on commit 646e225

Please sign in to comment.