From 4706508db2a7b9e585bccb929ad0820962d8b955 Mon Sep 17 00:00:00 2001 From: Jin Shang Date: Mon, 26 Sep 2022 15:33:46 +0000 Subject: [PATCH 1/4] ARROW-17847: [C++] Support unquoted decimal in JSON parser --- cpp/src/arrow/json/options.h | 3 +++ cpp/src/arrow/json/parser.cc | 32 +++++++++++++++++++------------ cpp/src/arrow/json/parser.h | 7 +++++-- cpp/src/arrow/json/parser_test.cc | 20 +++++++++++++++++++ cpp/src/arrow/json/reader_test.cc | 28 +++++++++++++++++++++++++++ cpp/src/arrow/json/test_common.h | 14 ++++++++++++++ 6 files changed, 90 insertions(+), 14 deletions(-) diff --git a/cpp/src/arrow/json/options.h b/cpp/src/arrow/json/options.h index d7edab9ceddb4..d21eff07fc6aa 100644 --- a/cpp/src/arrow/json/options.h +++ b/cpp/src/arrow/json/options.h @@ -53,6 +53,9 @@ struct ARROW_EXPORT ParseOptions { /// How JSON fields outside of explicit_schema (if given) are treated UnexpectedFieldBehavior unexpected_field_behavior = UnexpectedFieldBehavior::InferType; + /// Whether decimals are represented as strings(quoted) or numbers(unquoted) + bool parse_decimal_as_number = false; + /// Create parsing options with default values static ParseOptions Defaults(); }; diff --git a/cpp/src/arrow/json/parser.cc b/cpp/src/arrow/json/parser.cc index cd32b4433f1f8..6129d2d749513 100644 --- a/cpp/src/arrow/json/parser.cc +++ b/cpp/src/arrow/json/parser.cc @@ -26,6 +26,7 @@ #include #include +#include "arrow/json/options.h" #include "arrow/json/rapidjson_defs.h" #include "rapidjson/error/en.h" #include "rapidjson/reader.h" @@ -92,7 +93,8 @@ Kind::type Kind::FromTag(const std::shared_ptr& tag) { return static_cast(name_to_kind.Find(name)); } -Status Kind::ForType(const DataType& type, Kind::type* kind) { +Status Kind::ForType(const DataType& type, const ParseOptions& options, + Kind::type* kind) { struct { Status Visit(const NullType&) { return SetKind(Kind::kNull); } Status Visit(const BooleanType&) { return SetKind(Kind::kBoolean); } @@ -102,9 +104,12 @@ Status Kind::ForType(const DataType& type, Kind::type* kind) { Status Visit(const BinaryType&) { return SetKind(Kind::kString); } Status Visit(const LargeBinaryType&) { return SetKind(Kind::kString); } Status Visit(const TimestampType&) { return SetKind(Kind::kString); } - Status Visit(const FixedSizeBinaryType&) { return SetKind(Kind::kString); } + Status Visit(const FixedSizeBinaryType&) { + return options_.parse_decimal_as_number ? SetKind(Kind::kNumber) + : SetKind(Kind::kString); + } Status Visit(const DictionaryType& dict_type) { - return Kind::ForType(*dict_type.value_type(), kind_); + return Kind::ForType(*dict_type.value_type(), options_, kind_); } Status Visit(const ListType&) { return SetKind(Kind::kArray); } Status Visit(const StructType&) { return SetKind(Kind::kObject); } @@ -115,8 +120,9 @@ Status Kind::ForType(const DataType& type, Kind::type* kind) { *kind_ = kind; return Status::OK(); } + const ParseOptions& options_; Kind::type* kind_; - } visitor = {kind}; + } visitor = {options, kind}; return VisitTypeInline(type, &visitor); } @@ -393,7 +399,8 @@ class RawArrayBuilder { class RawBuilderSet { public: - explicit RawBuilderSet(MemoryPool* pool) : pool_(pool) {} + explicit RawBuilderSet(MemoryPool* pool, const ParseOptions& options) + : pool_(pool), options_(options) {} /// Retrieve a pointer to a builder from a BuilderPtr template @@ -415,7 +422,7 @@ class RawBuilderSet { /// construct a builder of whatever kind corresponds to a DataType Status MakeBuilder(const DataType& t, int64_t leading_nulls, BuilderPtr* builder) { Kind::type kind; - RETURN_NOT_OK(Kind::ForType(t, &kind)); + RETURN_NOT_OK(Kind::ForType(t, options_, &kind)); switch (kind) { case Kind::kNull: *builder = BuilderPtr(Kind::kNull, static_cast(leading_nulls), true); @@ -565,6 +572,7 @@ class RawBuilderSet { std::vector>, std::vector>> arenas_; + const ParseOptions& options_; }; /// Three implementations are provided for BlockParser, one for each @@ -573,9 +581,9 @@ class RawBuilderSet { class HandlerBase : public BlockParser, public rj::BaseReaderHandler, HandlerBase> { public: - explicit HandlerBase(MemoryPool* pool) - : BlockParser(pool), - builder_set_(pool), + explicit HandlerBase(MemoryPool* pool, const ParseOptions& options) + : BlockParser(pool, options), + builder_set_(pool, options), field_index_(-1), scalar_values_builder_(pool) {} @@ -1086,15 +1094,15 @@ Status BlockParser::Make(MemoryPool* pool, const ParseOptions& options, switch (options.unexpected_field_behavior) { case UnexpectedFieldBehavior::Ignore: { - *out = std::make_unique>(pool); + *out = std::make_unique>(pool, options); break; } case UnexpectedFieldBehavior::Error: { - *out = std::make_unique>(pool); + *out = std::make_unique>(pool, options); break; } case UnexpectedFieldBehavior::InferType: - *out = std::make_unique>(pool); + *out = std::make_unique>(pool, options); break; } return static_cast(**out).Initialize(options.explicit_schema); diff --git a/cpp/src/arrow/json/parser.h b/cpp/src/arrow/json/parser.h index 4dd14e4b80c68..486de229ca5fc 100644 --- a/cpp/src/arrow/json/parser.h +++ b/cpp/src/arrow/json/parser.h @@ -45,7 +45,8 @@ struct Kind { static Kind::type FromTag(const std::shared_ptr& tag); - static Status ForType(const DataType& type, Kind::type* kind); + static Status ForType(const DataType& type, const ParseOptions& options, + Kind::type* kind); }; constexpr int32_t kMaxParserNumRows = 100000; @@ -91,10 +92,12 @@ class ARROW_EXPORT BlockParser { protected: ARROW_DISALLOW_COPY_AND_ASSIGN(BlockParser); - explicit BlockParser(MemoryPool* pool) : pool_(pool) {} + explicit BlockParser(MemoryPool* pool, const ParseOptions& options) + : pool_(pool), options_(options) {} MemoryPool* pool_; int32_t num_rows_ = 0; + const ParseOptions& options_; }; } // namespace json diff --git a/cpp/src/arrow/json/parser_test.cc b/cpp/src/arrow/json/parser_test.cc index e1f346bda3b49..2202f86c96aae 100644 --- a/cpp/src/arrow/json/parser_test.cc +++ b/cpp/src/arrow/json/parser_test.cc @@ -29,6 +29,7 @@ #include "arrow/json/test_common.h" #include "arrow/status.h" #include "arrow/testing/gtest_util.h" +#include "arrow/type_fwd.h" #include "arrow/util/checked_cast.h" namespace arrow { @@ -136,6 +137,25 @@ TEST(BlockParserWithSchema, SkipFieldsOutsideSchema) { "[\"thing\", null, \"\xe5\xbf\x8d\", null]"}); } +TEST(BlockParserWithSchema, UnquotedDecimal) { + auto options = ParseOptions::Defaults(); + options.explicit_schema = + schema({field("price", decimal(9, 2)), field("cost", decimal(9, 3))}); + options.parse_decimal_as_number = true; + AssertParseColumns(options, unquoted_decimal_src(), + {field("price", utf8()), field("cost", utf8())}, + {R"(["30.04", "1.23"])", R"(["30.001", "1.229"])"}); +} + +TEST(BlockParserWithSchema, MixedDecimal) { + auto options = ParseOptions::Defaults(); + options.explicit_schema = + schema({field("price", decimal(9, 2)), field("cost", decimal(9, 2))}); + options.parse_decimal_as_number = true; + std::shared_ptr parsed; + ASSERT_RAISES(Invalid, ParseFromString(options, mixed_decimal_src(), &parsed)); +} + class BlockParserTypeError : public ::testing::TestWithParam { public: ParseOptions Options(std::shared_ptr explicit_schema) { diff --git a/cpp/src/arrow/json/reader_test.cc b/cpp/src/arrow/json/reader_test.cc index 4037bf0be66d1..0cee170dccb03 100644 --- a/cpp/src/arrow/json/reader_test.cc +++ b/cpp/src/arrow/json/reader_test.cc @@ -27,6 +27,7 @@ #include "arrow/json/test_common.h" #include "arrow/table.h" #include "arrow/testing/gtest_util.h" +#include "arrow/type_fwd.h" namespace arrow { namespace json { @@ -274,5 +275,32 @@ TEST(ReaderTest, ListArrayWithFewValues) { AssertTablesEqual(*actual_table, *expected_table); } +TEST_P(ReaderTest, UnquotedDecimal) { + parse_options_.unexpected_field_behavior = UnexpectedFieldBehavior::InferType; + parse_options_.parse_decimal_as_number = true; + auto schema = + ::arrow::schema({field("price", decimal(9, 2)), field("cost", decimal(9, 3))}); + parse_options_.explicit_schema = schema; + auto src = unquoted_decimal_src(); + SetUpReader(src); + ASSERT_OK_AND_ASSIGN(table_, reader_->Read()); + + auto expected_table = Table::Make( + schema, {ArrayFromJSON(schema->field(0)->type(), R"(["30.04", "1.23"])"), + ArrayFromJSON(schema->field(1)->type(), R"(["30.001", "1.229"])")}); + AssertTablesEqual(*expected_table, *table_); +} + +TEST_P(ReaderTest, MixedDecimal) { + parse_options_.unexpected_field_behavior = UnexpectedFieldBehavior::InferType; + parse_options_.parse_decimal_as_number = true; + auto schema = + ::arrow::schema({field("price", decimal(9, 2)), field("cost", decimal(9, 3))}); + parse_options_.explicit_schema = schema; + auto src = mixed_decimal_src(); + SetUpReader(src); + ASSERT_RAISES(Invalid, reader_->Read()); +} + } // namespace json } // namespace arrow diff --git a/cpp/src/arrow/json/test_common.h b/cpp/src/arrow/json/test_common.h index 18007a4963845..33177fa85b67a 100644 --- a/cpp/src/arrow/json/test_common.h +++ b/cpp/src/arrow/json/test_common.h @@ -259,5 +259,19 @@ inline static std::string null_src() { )"; } +inline static std::string unquoted_decimal_src() { + return R"( + { "price": 30.04, "cost":30.001 } + { "price": 1.23, "cost":1.229 } + )"; +} + +inline static std::string mixed_decimal_src() { + return R"( + { "price": 30.04, "cost": "30.001" } + { "price": 1.23, "cost": "1.229" } + )"; +} + } // namespace json } // namespace arrow From c8ef16bbd0e89029cd374606bf9fcd2b65cd5eee Mon Sep 17 00:00:00 2001 From: Jin Shang Date: Tue, 27 Sep 2022 15:56:03 +0000 Subject: [PATCH 2/4] switch to kStringOrNumber type, support both quoted and unquoted automatically --- cpp/src/arrow/json/options.h | 3 -- cpp/src/arrow/json/parser.cc | 70 +++++++++++++++++++------------ cpp/src/arrow/json/parser.h | 17 +++++--- cpp/src/arrow/json/parser_test.cc | 9 ++-- cpp/src/arrow/json/reader_test.cc | 9 ++-- 5 files changed, 65 insertions(+), 43 deletions(-) diff --git a/cpp/src/arrow/json/options.h b/cpp/src/arrow/json/options.h index d21eff07fc6aa..d7edab9ceddb4 100644 --- a/cpp/src/arrow/json/options.h +++ b/cpp/src/arrow/json/options.h @@ -53,9 +53,6 @@ struct ARROW_EXPORT ParseOptions { /// How JSON fields outside of explicit_schema (if given) are treated UnexpectedFieldBehavior unexpected_field_behavior = UnexpectedFieldBehavior::InferType; - /// Whether decimals are represented as strings(quoted) or numbers(unquoted) - bool parse_decimal_as_number = false; - /// Create parsing options with default values static ParseOptions Defaults(); }; diff --git a/cpp/src/arrow/json/parser.cc b/cpp/src/arrow/json/parser.cc index 6129d2d749513..c7ac60034437d 100644 --- a/cpp/src/arrow/json/parser.cc +++ b/cpp/src/arrow/json/parser.cc @@ -26,7 +26,6 @@ #include #include -#include "arrow/json/options.h" #include "arrow/json/rapidjson_defs.h" #include "rapidjson/error/en.h" #include "rapidjson/reader.h" @@ -56,8 +55,8 @@ static Status ParseError(T&&... t) { } const std::string& Kind::Name(Kind::type kind) { - static const std::string names[] = {"null", "boolean", "number", - "string", "array", "object"}; + static const std::string names[] = {"null", "boolean", "number", "string", + "array", "object", "number_or_string"}; return names[kind]; } @@ -70,6 +69,7 @@ const std::shared_ptr& Kind::Tag(Kind::type kind) { key_value_metadata({{"json_kind", Kind::Name(Kind::kString)}}), key_value_metadata({{"json_kind", Kind::Name(Kind::kArray)}}), key_value_metadata({{"json_kind", Kind::Name(Kind::kObject)}}), + key_value_metadata({{"json_kind", Kind::Name(Kind::kNumberOrString)}}), }; return tags[kind]; } @@ -77,7 +77,7 @@ const std::shared_ptr& Kind::Tag(Kind::type kind) { static arrow::internal::Trie MakeFromTagTrie() { arrow::internal::TrieBuilder builder; for (auto kind : {Kind::kNull, Kind::kBoolean, Kind::kNumber, Kind::kString, - Kind::kArray, Kind::kObject}) { + Kind::kArray, Kind::kObject, Kind::kNumberOrString}) { DCHECK_OK(builder.Append(Kind::Name(kind))); } auto name_to_kind = builder.Finish(); @@ -93,8 +93,7 @@ Kind::type Kind::FromTag(const std::shared_ptr& tag) { return static_cast(name_to_kind.Find(name)); } -Status Kind::ForType(const DataType& type, const ParseOptions& options, - Kind::type* kind) { +Status Kind::ForType(const DataType& type, Kind::type* kind) { struct { Status Visit(const NullType&) { return SetKind(Kind::kNull); } Status Visit(const BooleanType&) { return SetKind(Kind::kBoolean); } @@ -104,12 +103,9 @@ Status Kind::ForType(const DataType& type, const ParseOptions& options, Status Visit(const BinaryType&) { return SetKind(Kind::kString); } Status Visit(const LargeBinaryType&) { return SetKind(Kind::kString); } Status Visit(const TimestampType&) { return SetKind(Kind::kString); } - Status Visit(const FixedSizeBinaryType&) { - return options_.parse_decimal_as_number ? SetKind(Kind::kNumber) - : SetKind(Kind::kString); - } + Status Visit(const FixedSizeBinaryType&) { return SetKind(Kind::kNumberOrString); } Status Visit(const DictionaryType& dict_type) { - return Kind::ForType(*dict_type.value_type(), options_, kind_); + return Kind::ForType(*dict_type.value_type(), kind_); } Status Visit(const ListType&) { return SetKind(Kind::kArray); } Status Visit(const StructType&) { return SetKind(Kind::kObject); } @@ -120,9 +116,8 @@ Status Kind::ForType(const DataType& type, const ParseOptions& options, *kind_ = kind; return Status::OK(); } - const ParseOptions& options_; Kind::type* kind_; - } visitor = {options, kind}; + } visitor = {kind}; return VisitTypeInline(type, &visitor); } @@ -397,10 +392,15 @@ class RawArrayBuilder { TypedBufferBuilder null_bitmap_builder_; }; +template <> +class RawArrayBuilder : public ScalarBuilder { + public: + using ScalarBuilder::ScalarBuilder; +}; + class RawBuilderSet { public: - explicit RawBuilderSet(MemoryPool* pool, const ParseOptions& options) - : pool_(pool), options_(options) {} + explicit RawBuilderSet(MemoryPool* pool) : pool_(pool) {} /// Retrieve a pointer to a builder from a BuilderPtr template @@ -422,7 +422,7 @@ class RawBuilderSet { /// construct a builder of whatever kind corresponds to a DataType Status MakeBuilder(const DataType& t, int64_t leading_nulls, BuilderPtr* builder) { Kind::type kind; - RETURN_NOT_OK(Kind::ForType(t, options_, &kind)); + RETURN_NOT_OK(Kind::ForType(t, &kind)); switch (kind) { case Kind::kNull: *builder = BuilderPtr(Kind::kNull, static_cast(leading_nulls), true); @@ -461,6 +461,8 @@ class RawBuilderSet { } return Status::OK(); } + case Kind::kNumberOrString: + return MakeBuilder(leading_nulls, builder); default: return Status::NotImplemented("invalid builder type"); } @@ -511,6 +513,9 @@ class RawBuilderSet { } return Status::OK(); } + case Kind::kNumberOrString: { + return Cast(builder)->AppendNull(); + } default: return Status::NotImplemented("invalid builder Kind"); } @@ -543,6 +548,9 @@ class RawBuilderSet { case Kind::kObject: return Cast(builder)->Finish(std::move(finish_children), out); + case Kind::kNumberOrString: + return FinishScalar(scalar_values, Cast(builder), out); + default: return Status::NotImplemented("invalid builder kind"); } @@ -570,9 +578,9 @@ class RawBuilderSet { std::vector>, std::vector>, std::vector>, - std::vector>> + std::vector>, + std::vector>> arenas_; - const ParseOptions& options_; }; /// Three implementations are provided for BlockParser, one for each @@ -581,9 +589,9 @@ class RawBuilderSet { class HandlerBase : public BlockParser, public rj::BaseReaderHandler, HandlerBase> { public: - explicit HandlerBase(MemoryPool* pool, const ParseOptions& options) - : BlockParser(pool, options), - builder_set_(pool, options), + explicit HandlerBase(MemoryPool* pool) + : BlockParser(pool), + builder_set_(pool), field_index_(-1), scalar_values_builder_(pool) {} @@ -618,12 +626,22 @@ class HandlerBase : public BlockParser, } bool RawNumber(const char* data, rj::SizeType size, ...) { - status_ = AppendScalar(builder_, std::string_view(data, size)); + if (builder_.kind == Kind::kNumberOrString) { + status_ = + AppendScalar(builder_, std::string_view(data, size)); + } else { + status_ = AppendScalar(builder_, std::string_view(data, size)); + } return status_.ok(); } bool String(const char* data, rj::SizeType size, ...) { - status_ = AppendScalar(builder_, std::string_view(data, size)); + if (builder_.kind == Kind::kNumberOrString) { + status_ = + AppendScalar(builder_, std::string_view(data, size)); + } else { + status_ = AppendScalar(builder_, std::string_view(data, size)); + } return status_.ok(); } @@ -1094,15 +1112,15 @@ Status BlockParser::Make(MemoryPool* pool, const ParseOptions& options, switch (options.unexpected_field_behavior) { case UnexpectedFieldBehavior::Ignore: { - *out = std::make_unique>(pool, options); + *out = std::make_unique>(pool); break; } case UnexpectedFieldBehavior::Error: { - *out = std::make_unique>(pool, options); + *out = std::make_unique>(pool); break; } case UnexpectedFieldBehavior::InferType: - *out = std::make_unique>(pool, options); + *out = std::make_unique>(pool); break; } return static_cast(**out).Initialize(options.explicit_schema); diff --git a/cpp/src/arrow/json/parser.h b/cpp/src/arrow/json/parser.h index 486de229ca5fc..e21d09c4169d0 100644 --- a/cpp/src/arrow/json/parser.h +++ b/cpp/src/arrow/json/parser.h @@ -37,7 +37,15 @@ class ResizableBuffer; namespace json { struct Kind { - enum type : uint8_t { kNull, kBoolean, kNumber, kString, kArray, kObject }; + enum type : uint8_t { + kNull, + kBoolean, + kNumber, + kString, + kArray, + kObject, + kNumberOrString + }; static const std::string& Name(Kind::type); @@ -45,8 +53,7 @@ struct Kind { static Kind::type FromTag(const std::shared_ptr& tag); - static Status ForType(const DataType& type, const ParseOptions& options, - Kind::type* kind); + static Status ForType(const DataType& type, Kind::type* kind); }; constexpr int32_t kMaxParserNumRows = 100000; @@ -92,12 +99,10 @@ class ARROW_EXPORT BlockParser { protected: ARROW_DISALLOW_COPY_AND_ASSIGN(BlockParser); - explicit BlockParser(MemoryPool* pool, const ParseOptions& options) - : pool_(pool), options_(options) {} + explicit BlockParser(MemoryPool* pool) : pool_(pool) {} MemoryPool* pool_; int32_t num_rows_ = 0; - const ParseOptions& options_; }; } // namespace json diff --git a/cpp/src/arrow/json/parser_test.cc b/cpp/src/arrow/json/parser_test.cc index 2202f86c96aae..9e2ae47c95e59 100644 --- a/cpp/src/arrow/json/parser_test.cc +++ b/cpp/src/arrow/json/parser_test.cc @@ -141,7 +141,6 @@ TEST(BlockParserWithSchema, UnquotedDecimal) { auto options = ParseOptions::Defaults(); options.explicit_schema = schema({field("price", decimal(9, 2)), field("cost", decimal(9, 3))}); - options.parse_decimal_as_number = true; AssertParseColumns(options, unquoted_decimal_src(), {field("price", utf8()), field("cost", utf8())}, {R"(["30.04", "1.23"])", R"(["30.001", "1.229"])"}); @@ -150,10 +149,10 @@ TEST(BlockParserWithSchema, UnquotedDecimal) { TEST(BlockParserWithSchema, MixedDecimal) { auto options = ParseOptions::Defaults(); options.explicit_schema = - schema({field("price", decimal(9, 2)), field("cost", decimal(9, 2))}); - options.parse_decimal_as_number = true; - std::shared_ptr parsed; - ASSERT_RAISES(Invalid, ParseFromString(options, mixed_decimal_src(), &parsed)); + schema({field("price", decimal(9, 2)), field("cost", decimal(9, 3))}); + AssertParseColumns(options, mixed_decimal_src(), + {field("price", utf8()), field("cost", utf8())}, + {R"(["30.04", "1.23"])", R"(["30.001", "1.229"])"}); } class BlockParserTypeError : public ::testing::TestWithParam { diff --git a/cpp/src/arrow/json/reader_test.cc b/cpp/src/arrow/json/reader_test.cc index 0cee170dccb03..2ae14d1966970 100644 --- a/cpp/src/arrow/json/reader_test.cc +++ b/cpp/src/arrow/json/reader_test.cc @@ -277,7 +277,6 @@ TEST(ReaderTest, ListArrayWithFewValues) { TEST_P(ReaderTest, UnquotedDecimal) { parse_options_.unexpected_field_behavior = UnexpectedFieldBehavior::InferType; - parse_options_.parse_decimal_as_number = true; auto schema = ::arrow::schema({field("price", decimal(9, 2)), field("cost", decimal(9, 3))}); parse_options_.explicit_schema = schema; @@ -293,13 +292,17 @@ TEST_P(ReaderTest, UnquotedDecimal) { TEST_P(ReaderTest, MixedDecimal) { parse_options_.unexpected_field_behavior = UnexpectedFieldBehavior::InferType; - parse_options_.parse_decimal_as_number = true; auto schema = ::arrow::schema({field("price", decimal(9, 2)), field("cost", decimal(9, 3))}); parse_options_.explicit_schema = schema; auto src = mixed_decimal_src(); SetUpReader(src); - ASSERT_RAISES(Invalid, reader_->Read()); + ASSERT_OK_AND_ASSIGN(table_, reader_->Read()); + + auto expected_table = Table::Make( + schema, {ArrayFromJSON(schema->field(0)->type(), R"(["30.04", "1.23"])"), + ArrayFromJSON(schema->field(1)->type(), R"(["30.001", "1.229"])")}); + AssertTablesEqual(*expected_table, *table_); } } // namespace json From 86e583e7bd8c303f589c359fdd81b5ea56142f74 Mon Sep 17 00:00:00 2001 From: Jin Shang Date: Wed, 28 Sep 2022 01:57:50 +0000 Subject: [PATCH 3/4] address pr feedback --- cpp/src/arrow/json/parser.cc | 5 +++-- cpp/src/arrow/json/reader_test.cc | 14 ++++++++------ cpp/src/arrow/json/test_common.h | 4 ++-- 3 files changed, 13 insertions(+), 10 deletions(-) diff --git a/cpp/src/arrow/json/parser.cc b/cpp/src/arrow/json/parser.cc index c7ac60034437d..d19b57b0e4998 100644 --- a/cpp/src/arrow/json/parser.cc +++ b/cpp/src/arrow/json/parser.cc @@ -55,8 +55,9 @@ static Status ParseError(T&&... t) { } const std::string& Kind::Name(Kind::type kind) { - static const std::string names[] = {"null", "boolean", "number", "string", - "array", "object", "number_or_string"}; + static const std::string names[] = { + "null", "boolean", "number", "string", "array", "object", "number_or_string", + }; return names[kind]; } diff --git a/cpp/src/arrow/json/reader_test.cc b/cpp/src/arrow/json/reader_test.cc index 2ae14d1966970..eef117b777da8 100644 --- a/cpp/src/arrow/json/reader_test.cc +++ b/cpp/src/arrow/json/reader_test.cc @@ -284,9 +284,10 @@ TEST_P(ReaderTest, UnquotedDecimal) { SetUpReader(src); ASSERT_OK_AND_ASSIGN(table_, reader_->Read()); - auto expected_table = Table::Make( - schema, {ArrayFromJSON(schema->field(0)->type(), R"(["30.04", "1.23"])"), - ArrayFromJSON(schema->field(1)->type(), R"(["30.001", "1.229"])")}); + auto expected_table = TableFromJSON(schema, {R"([ + { "price": "30.04", "cost":"30.001" }, + { "price": "1.23", "cost":"1.229" } + ])"}); AssertTablesEqual(*expected_table, *table_); } @@ -299,9 +300,10 @@ TEST_P(ReaderTest, MixedDecimal) { SetUpReader(src); ASSERT_OK_AND_ASSIGN(table_, reader_->Read()); - auto expected_table = Table::Make( - schema, {ArrayFromJSON(schema->field(0)->type(), R"(["30.04", "1.23"])"), - ArrayFromJSON(schema->field(1)->type(), R"(["30.001", "1.229"])")}); + auto expected_table = TableFromJSON(schema, {R"([ + { "price": "30.04", "cost":"30.001" }, + { "price": "1.23", "cost":"1.229" } + ])"}); AssertTablesEqual(*expected_table, *table_); } diff --git a/cpp/src/arrow/json/test_common.h b/cpp/src/arrow/json/test_common.h index 33177fa85b67a..a65932964895c 100644 --- a/cpp/src/arrow/json/test_common.h +++ b/cpp/src/arrow/json/test_common.h @@ -268,8 +268,8 @@ inline static std::string unquoted_decimal_src() { inline static std::string mixed_decimal_src() { return R"( - { "price": 30.04, "cost": "30.001" } - { "price": 1.23, "cost": "1.229" } + { "price": 30.04, "cost": 30.001 } + { "price": "1.23", "cost": "1.229" } )"; } From bbcd46bf67b38ac331e0a0f727f9a49071fa9719 Mon Sep 17 00:00:00 2001 From: Jin Shang Date: Wed, 28 Sep 2022 04:43:27 +0000 Subject: [PATCH 4/4] address pr feedback --- cpp/src/arrow/json/parser.cc | 21 ++++++----- cpp/src/arrow/json/reader_test.cc | 62 +++++++++++++++---------------- 2 files changed, 42 insertions(+), 41 deletions(-) diff --git a/cpp/src/arrow/json/parser.cc b/cpp/src/arrow/json/parser.cc index d19b57b0e4998..79ea9a89e752e 100644 --- a/cpp/src/arrow/json/parser.cc +++ b/cpp/src/arrow/json/parser.cc @@ -104,7 +104,7 @@ Status Kind::ForType(const DataType& type, Kind::type* kind) { Status Visit(const BinaryType&) { return SetKind(Kind::kString); } Status Visit(const LargeBinaryType&) { return SetKind(Kind::kString); } Status Visit(const TimestampType&) { return SetKind(Kind::kString); } - Status Visit(const FixedSizeBinaryType&) { return SetKind(Kind::kNumberOrString); } + Status Visit(const DecimalType&) { return SetKind(Kind::kNumberOrString); } Status Visit(const DictionaryType& dict_type) { return Kind::ForType(*dict_type.value_type(), kind_); } @@ -438,6 +438,9 @@ class RawBuilderSet { case Kind::kString: return MakeBuilder(leading_nulls, builder); + case Kind::kNumberOrString: + return MakeBuilder(leading_nulls, builder); + case Kind::kArray: { RETURN_NOT_OK(MakeBuilder(leading_nulls, builder)); const auto& list_type = checked_cast(t); @@ -462,8 +465,6 @@ class RawBuilderSet { } return Status::OK(); } - case Kind::kNumberOrString: - return MakeBuilder(leading_nulls, builder); default: return Status::NotImplemented("invalid builder type"); } @@ -501,6 +502,10 @@ class RawBuilderSet { case Kind::kString: return Cast(builder)->AppendNull(); + case Kind::kNumberOrString: { + return Cast(builder)->AppendNull(); + } + case Kind::kArray: return Cast(builder)->AppendNull(); @@ -514,9 +519,7 @@ class RawBuilderSet { } return Status::OK(); } - case Kind::kNumberOrString: { - return Cast(builder)->AppendNull(); - } + default: return Status::NotImplemented("invalid builder Kind"); } @@ -543,15 +546,15 @@ class RawBuilderSet { case Kind::kString: return FinishScalar(scalar_values, Cast(builder), out); + case Kind::kNumberOrString: + return FinishScalar(scalar_values, Cast(builder), out); + case Kind::kArray: return Cast(builder)->Finish(std::move(finish_children), out); case Kind::kObject: return Cast(builder)->Finish(std::move(finish_children), out); - case Kind::kNumberOrString: - return FinishScalar(scalar_values, Cast(builder), out); - default: return Status::NotImplemented("invalid builder kind"); } diff --git a/cpp/src/arrow/json/reader_test.cc b/cpp/src/arrow/json/reader_test.cc index eef117b777da8..452409209c4ce 100644 --- a/cpp/src/arrow/json/reader_test.cc +++ b/cpp/src/arrow/json/reader_test.cc @@ -204,6 +204,36 @@ TEST_P(ReaderTest, MultipleChunks) { AssertTablesEqual(*expected_table, *table_); } +TEST_P(ReaderTest, UnquotedDecimal) { + auto schema = + ::arrow::schema({field("price", decimal(9, 2)), field("cost", decimal(9, 3))}); + parse_options_.explicit_schema = schema; + auto src = unquoted_decimal_src(); + SetUpReader(src); + ASSERT_OK_AND_ASSIGN(table_, reader_->Read()); + + auto expected_table = TableFromJSON(schema, {R"([ + { "price": "30.04", "cost":"30.001" }, + { "price": "1.23", "cost":"1.229" } + ])"}); + AssertTablesEqual(*expected_table, *table_); +} + +TEST_P(ReaderTest, MixedDecimal) { + auto schema = + ::arrow::schema({field("price", decimal(9, 2)), field("cost", decimal(9, 3))}); + parse_options_.explicit_schema = schema; + auto src = mixed_decimal_src(); + SetUpReader(src); + ASSERT_OK_AND_ASSIGN(table_, reader_->Read()); + + auto expected_table = TableFromJSON(schema, {R"([ + { "price": "30.04", "cost":"30.001" }, + { "price": "1.23", "cost":"1.229" } + ])"}); + AssertTablesEqual(*expected_table, *table_); +} + TEST(ReaderTest, MultipleChunksParallel) { int64_t count = 1 << 10; @@ -275,37 +305,5 @@ TEST(ReaderTest, ListArrayWithFewValues) { AssertTablesEqual(*actual_table, *expected_table); } -TEST_P(ReaderTest, UnquotedDecimal) { - parse_options_.unexpected_field_behavior = UnexpectedFieldBehavior::InferType; - auto schema = - ::arrow::schema({field("price", decimal(9, 2)), field("cost", decimal(9, 3))}); - parse_options_.explicit_schema = schema; - auto src = unquoted_decimal_src(); - SetUpReader(src); - ASSERT_OK_AND_ASSIGN(table_, reader_->Read()); - - auto expected_table = TableFromJSON(schema, {R"([ - { "price": "30.04", "cost":"30.001" }, - { "price": "1.23", "cost":"1.229" } - ])"}); - AssertTablesEqual(*expected_table, *table_); -} - -TEST_P(ReaderTest, MixedDecimal) { - parse_options_.unexpected_field_behavior = UnexpectedFieldBehavior::InferType; - auto schema = - ::arrow::schema({field("price", decimal(9, 2)), field("cost", decimal(9, 3))}); - parse_options_.explicit_schema = schema; - auto src = mixed_decimal_src(); - SetUpReader(src); - ASSERT_OK_AND_ASSIGN(table_, reader_->Read()); - - auto expected_table = TableFromJSON(schema, {R"([ - { "price": "30.04", "cost":"30.001" }, - { "price": "1.23", "cost":"1.229" } - ])"}); - AssertTablesEqual(*expected_table, *table_); -} - } // namespace json } // namespace arrow