From eec94ff4047b9e1bbcf3da46f1c34e3687a171bb Mon Sep 17 00:00:00 2001 From: Jin Shang Date: Thu, 29 Sep 2022 00:52:56 +0800 Subject: [PATCH] ARROW-17847: [C++] Support unquoted decimal in JSON parser (#14242) Support both quoted and unquoted decimal in JSON parser automatically. Authored-by: Jin Shang Signed-off-by: Antoine Pitrou --- cpp/src/arrow/json/parser.cc | 44 ++++++++++++++++++++++++++----- cpp/src/arrow/json/parser.h | 10 ++++++- cpp/src/arrow/json/parser_test.cc | 19 +++++++++++++ cpp/src/arrow/json/reader_test.cc | 31 ++++++++++++++++++++++ cpp/src/arrow/json/test_common.h | 14 ++++++++++ 5 files changed, 110 insertions(+), 8 deletions(-) diff --git a/cpp/src/arrow/json/parser.cc b/cpp/src/arrow/json/parser.cc index cd32b4433f1f8..79ea9a89e752e 100644 --- a/cpp/src/arrow/json/parser.cc +++ b/cpp/src/arrow/json/parser.cc @@ -55,8 +55,9 @@ static Status ParseError(T&&... t) { } const std::string& Kind::Name(Kind::type kind) { - static const std::string names[] = {"null", "boolean", "number", - "string", "array", "object"}; + static const std::string names[] = { + "null", "boolean", "number", "string", "array", "object", "number_or_string", + }; return names[kind]; } @@ -69,6 +70,7 @@ const std::shared_ptr& Kind::Tag(Kind::type kind) { key_value_metadata({{"json_kind", Kind::Name(Kind::kString)}}), key_value_metadata({{"json_kind", Kind::Name(Kind::kArray)}}), key_value_metadata({{"json_kind", Kind::Name(Kind::kObject)}}), + key_value_metadata({{"json_kind", Kind::Name(Kind::kNumberOrString)}}), }; return tags[kind]; } @@ -76,7 +78,7 @@ const std::shared_ptr& Kind::Tag(Kind::type kind) { static arrow::internal::Trie MakeFromTagTrie() { arrow::internal::TrieBuilder builder; for (auto kind : {Kind::kNull, Kind::kBoolean, Kind::kNumber, Kind::kString, - Kind::kArray, Kind::kObject}) { + Kind::kArray, Kind::kObject, Kind::kNumberOrString}) { DCHECK_OK(builder.Append(Kind::Name(kind))); } auto name_to_kind = builder.Finish(); @@ -102,7 +104,7 @@ Status Kind::ForType(const DataType& type, Kind::type* kind) { Status Visit(const BinaryType&) { return SetKind(Kind::kString); } Status Visit(const LargeBinaryType&) { return SetKind(Kind::kString); } Status Visit(const TimestampType&) { return SetKind(Kind::kString); } - Status Visit(const FixedSizeBinaryType&) { return SetKind(Kind::kString); } + Status Visit(const DecimalType&) { return SetKind(Kind::kNumberOrString); } Status Visit(const DictionaryType& dict_type) { return Kind::ForType(*dict_type.value_type(), kind_); } @@ -391,6 +393,12 @@ class RawArrayBuilder { TypedBufferBuilder null_bitmap_builder_; }; +template <> +class RawArrayBuilder : public ScalarBuilder { + public: + using ScalarBuilder::ScalarBuilder; +}; + class RawBuilderSet { public: explicit RawBuilderSet(MemoryPool* pool) : pool_(pool) {} @@ -430,6 +438,9 @@ class RawBuilderSet { case Kind::kString: return MakeBuilder(leading_nulls, builder); + case Kind::kNumberOrString: + return MakeBuilder(leading_nulls, builder); + case Kind::kArray: { RETURN_NOT_OK(MakeBuilder(leading_nulls, builder)); const auto& list_type = checked_cast(t); @@ -491,6 +502,10 @@ class RawBuilderSet { case Kind::kString: return Cast(builder)->AppendNull(); + case Kind::kNumberOrString: { + return Cast(builder)->AppendNull(); + } + case Kind::kArray: return Cast(builder)->AppendNull(); @@ -504,6 +519,7 @@ class RawBuilderSet { } return Status::OK(); } + default: return Status::NotImplemented("invalid builder Kind"); } @@ -530,6 +546,9 @@ class RawBuilderSet { case Kind::kString: return FinishScalar(scalar_values, Cast(builder), out); + case Kind::kNumberOrString: + return FinishScalar(scalar_values, Cast(builder), out); + case Kind::kArray: return Cast(builder)->Finish(std::move(finish_children), out); @@ -563,7 +582,8 @@ class RawBuilderSet { std::vector>, std::vector>, std::vector>, - std::vector>> + std::vector>, + std::vector>> arenas_; }; @@ -610,12 +630,22 @@ class HandlerBase : public BlockParser, } bool RawNumber(const char* data, rj::SizeType size, ...) { - status_ = AppendScalar(builder_, std::string_view(data, size)); + if (builder_.kind == Kind::kNumberOrString) { + status_ = + AppendScalar(builder_, std::string_view(data, size)); + } else { + status_ = AppendScalar(builder_, std::string_view(data, size)); + } return status_.ok(); } bool String(const char* data, rj::SizeType size, ...) { - status_ = AppendScalar(builder_, std::string_view(data, size)); + if (builder_.kind == Kind::kNumberOrString) { + status_ = + AppendScalar(builder_, std::string_view(data, size)); + } else { + status_ = AppendScalar(builder_, std::string_view(data, size)); + } return status_.ok(); } diff --git a/cpp/src/arrow/json/parser.h b/cpp/src/arrow/json/parser.h index 4dd14e4b80c68..e21d09c4169d0 100644 --- a/cpp/src/arrow/json/parser.h +++ b/cpp/src/arrow/json/parser.h @@ -37,7 +37,15 @@ class ResizableBuffer; namespace json { struct Kind { - enum type : uint8_t { kNull, kBoolean, kNumber, kString, kArray, kObject }; + enum type : uint8_t { + kNull, + kBoolean, + kNumber, + kString, + kArray, + kObject, + kNumberOrString + }; static const std::string& Name(Kind::type); diff --git a/cpp/src/arrow/json/parser_test.cc b/cpp/src/arrow/json/parser_test.cc index e1f346bda3b49..9e2ae47c95e59 100644 --- a/cpp/src/arrow/json/parser_test.cc +++ b/cpp/src/arrow/json/parser_test.cc @@ -29,6 +29,7 @@ #include "arrow/json/test_common.h" #include "arrow/status.h" #include "arrow/testing/gtest_util.h" +#include "arrow/type_fwd.h" #include "arrow/util/checked_cast.h" namespace arrow { @@ -136,6 +137,24 @@ TEST(BlockParserWithSchema, SkipFieldsOutsideSchema) { "[\"thing\", null, \"\xe5\xbf\x8d\", null]"}); } +TEST(BlockParserWithSchema, UnquotedDecimal) { + auto options = ParseOptions::Defaults(); + options.explicit_schema = + schema({field("price", decimal(9, 2)), field("cost", decimal(9, 3))}); + AssertParseColumns(options, unquoted_decimal_src(), + {field("price", utf8()), field("cost", utf8())}, + {R"(["30.04", "1.23"])", R"(["30.001", "1.229"])"}); +} + +TEST(BlockParserWithSchema, MixedDecimal) { + auto options = ParseOptions::Defaults(); + options.explicit_schema = + schema({field("price", decimal(9, 2)), field("cost", decimal(9, 3))}); + AssertParseColumns(options, mixed_decimal_src(), + {field("price", utf8()), field("cost", utf8())}, + {R"(["30.04", "1.23"])", R"(["30.001", "1.229"])"}); +} + class BlockParserTypeError : public ::testing::TestWithParam { public: ParseOptions Options(std::shared_ptr explicit_schema) { diff --git a/cpp/src/arrow/json/reader_test.cc b/cpp/src/arrow/json/reader_test.cc index 4037bf0be66d1..452409209c4ce 100644 --- a/cpp/src/arrow/json/reader_test.cc +++ b/cpp/src/arrow/json/reader_test.cc @@ -27,6 +27,7 @@ #include "arrow/json/test_common.h" #include "arrow/table.h" #include "arrow/testing/gtest_util.h" +#include "arrow/type_fwd.h" namespace arrow { namespace json { @@ -203,6 +204,36 @@ TEST_P(ReaderTest, MultipleChunks) { AssertTablesEqual(*expected_table, *table_); } +TEST_P(ReaderTest, UnquotedDecimal) { + auto schema = + ::arrow::schema({field("price", decimal(9, 2)), field("cost", decimal(9, 3))}); + parse_options_.explicit_schema = schema; + auto src = unquoted_decimal_src(); + SetUpReader(src); + ASSERT_OK_AND_ASSIGN(table_, reader_->Read()); + + auto expected_table = TableFromJSON(schema, {R"([ + { "price": "30.04", "cost":"30.001" }, + { "price": "1.23", "cost":"1.229" } + ])"}); + AssertTablesEqual(*expected_table, *table_); +} + +TEST_P(ReaderTest, MixedDecimal) { + auto schema = + ::arrow::schema({field("price", decimal(9, 2)), field("cost", decimal(9, 3))}); + parse_options_.explicit_schema = schema; + auto src = mixed_decimal_src(); + SetUpReader(src); + ASSERT_OK_AND_ASSIGN(table_, reader_->Read()); + + auto expected_table = TableFromJSON(schema, {R"([ + { "price": "30.04", "cost":"30.001" }, + { "price": "1.23", "cost":"1.229" } + ])"}); + AssertTablesEqual(*expected_table, *table_); +} + TEST(ReaderTest, MultipleChunksParallel) { int64_t count = 1 << 10; diff --git a/cpp/src/arrow/json/test_common.h b/cpp/src/arrow/json/test_common.h index 18007a4963845..a65932964895c 100644 --- a/cpp/src/arrow/json/test_common.h +++ b/cpp/src/arrow/json/test_common.h @@ -259,5 +259,19 @@ inline static std::string null_src() { )"; } +inline static std::string unquoted_decimal_src() { + return R"( + { "price": 30.04, "cost":30.001 } + { "price": 1.23, "cost":1.229 } + )"; +} + +inline static std::string mixed_decimal_src() { + return R"( + { "price": 30.04, "cost": 30.001 } + { "price": "1.23", "cost": "1.229" } + )"; +} + } // namespace json } // namespace arrow