Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

ARROW-17847: [C++] Support unquoted decimal in JSON parser #14242

Merged
merged 4 commits into from
Sep 28, 2022
Merged
Show file tree
Hide file tree
Changes from 3 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
41 changes: 34 additions & 7 deletions cpp/src/arrow/json/parser.cc
Original file line number Diff line number Diff line change
Expand Up @@ -55,8 +55,9 @@ static Status ParseError(T&&... t) {
}

const std::string& Kind::Name(Kind::type kind) {
static const std::string names[] = {"null", "boolean", "number",
"string", "array", "object"};
static const std::string names[] = {
"null", "boolean", "number", "string", "array", "object", "number_or_string",
};

return names[kind];
}
Expand All @@ -69,14 +70,15 @@ const std::shared_ptr<const KeyValueMetadata>& Kind::Tag(Kind::type kind) {
key_value_metadata({{"json_kind", Kind::Name(Kind::kString)}}),
key_value_metadata({{"json_kind", Kind::Name(Kind::kArray)}}),
key_value_metadata({{"json_kind", Kind::Name(Kind::kObject)}}),
key_value_metadata({{"json_kind", Kind::Name(Kind::kNumberOrString)}}),
};
return tags[kind];
}

static arrow::internal::Trie MakeFromTagTrie() {
arrow::internal::TrieBuilder builder;
for (auto kind : {Kind::kNull, Kind::kBoolean, Kind::kNumber, Kind::kString,
Kind::kArray, Kind::kObject}) {
Kind::kArray, Kind::kObject, Kind::kNumberOrString}) {
DCHECK_OK(builder.Append(Kind::Name(kind)));
}
auto name_to_kind = builder.Finish();
Expand All @@ -102,7 +104,7 @@ Status Kind::ForType(const DataType& type, Kind::type* kind) {
Status Visit(const BinaryType&) { return SetKind(Kind::kString); }
Status Visit(const LargeBinaryType&) { return SetKind(Kind::kString); }
Status Visit(const TimestampType&) { return SetKind(Kind::kString); }
Status Visit(const FixedSizeBinaryType&) { return SetKind(Kind::kString); }
Status Visit(const FixedSizeBinaryType&) { return SetKind(Kind::kNumberOrString); }
js8544 marked this conversation as resolved.
Show resolved Hide resolved
Status Visit(const DictionaryType& dict_type) {
return Kind::ForType(*dict_type.value_type(), kind_);
}
Expand Down Expand Up @@ -391,6 +393,12 @@ class RawArrayBuilder<Kind::kObject> {
TypedBufferBuilder<bool> null_bitmap_builder_;
};

template <>
class RawArrayBuilder<Kind::kNumberOrString> : public ScalarBuilder {
public:
using ScalarBuilder::ScalarBuilder;
};

class RawBuilderSet {
public:
explicit RawBuilderSet(MemoryPool* pool) : pool_(pool) {}
Expand Down Expand Up @@ -454,6 +462,8 @@ class RawBuilderSet {
}
return Status::OK();
}
case Kind::kNumberOrString:
return MakeBuilder<Kind::kNumberOrString>(leading_nulls, builder);
js8544 marked this conversation as resolved.
Show resolved Hide resolved
default:
return Status::NotImplemented("invalid builder type");
}
Expand Down Expand Up @@ -504,6 +514,9 @@ class RawBuilderSet {
}
return Status::OK();
}
case Kind::kNumberOrString: {
return Cast<Kind::kNumberOrString>(builder)->AppendNull();
js8544 marked this conversation as resolved.
Show resolved Hide resolved
}
default:
return Status::NotImplemented("invalid builder Kind");
}
Expand Down Expand Up @@ -536,6 +549,9 @@ class RawBuilderSet {
case Kind::kObject:
return Cast<Kind::kObject>(builder)->Finish(std::move(finish_children), out);

case Kind::kNumberOrString:
return FinishScalar(scalar_values, Cast<Kind::kNumberOrString>(builder), out);
js8544 marked this conversation as resolved.
Show resolved Hide resolved

default:
return Status::NotImplemented("invalid builder kind");
}
Expand Down Expand Up @@ -563,7 +579,8 @@ class RawBuilderSet {
std::vector<RawArrayBuilder<Kind::kNumber>>,
std::vector<RawArrayBuilder<Kind::kString>>,
std::vector<RawArrayBuilder<Kind::kArray>>,
std::vector<RawArrayBuilder<Kind::kObject>>>
std::vector<RawArrayBuilder<Kind::kObject>>,
std::vector<RawArrayBuilder<Kind::kNumberOrString>>>
arenas_;
};

Expand Down Expand Up @@ -610,12 +627,22 @@ class HandlerBase : public BlockParser,
}

bool RawNumber(const char* data, rj::SizeType size, ...) {
status_ = AppendScalar<Kind::kNumber>(builder_, std::string_view(data, size));
if (builder_.kind == Kind::kNumberOrString) {
status_ =
AppendScalar<Kind::kNumberOrString>(builder_, std::string_view(data, size));
} else {
pitrou marked this conversation as resolved.
Show resolved Hide resolved
status_ = AppendScalar<Kind::kNumber>(builder_, std::string_view(data, size));
}
return status_.ok();
}

bool String(const char* data, rj::SizeType size, ...) {
status_ = AppendScalar<Kind::kString>(builder_, std::string_view(data, size));
if (builder_.kind == Kind::kNumberOrString) {
status_ =
AppendScalar<Kind::kNumberOrString>(builder_, std::string_view(data, size));
} else {
status_ = AppendScalar<Kind::kString>(builder_, std::string_view(data, size));
}
return status_.ok();
}

Expand Down
10 changes: 9 additions & 1 deletion cpp/src/arrow/json/parser.h
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,15 @@ class ResizableBuffer;
namespace json {

struct Kind {
enum type : uint8_t { kNull, kBoolean, kNumber, kString, kArray, kObject };
enum type : uint8_t {
kNull,
kBoolean,
kNumber,
kString,
kArray,
kObject,
kNumberOrString
};

static const std::string& Name(Kind::type);

Expand Down
19 changes: 19 additions & 0 deletions cpp/src/arrow/json/parser_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@
#include "arrow/json/test_common.h"
#include "arrow/status.h"
#include "arrow/testing/gtest_util.h"
#include "arrow/type_fwd.h"
#include "arrow/util/checked_cast.h"

namespace arrow {
Expand Down Expand Up @@ -136,6 +137,24 @@ TEST(BlockParserWithSchema, SkipFieldsOutsideSchema) {
"[\"thing\", null, \"\xe5\xbf\x8d\", null]"});
}

TEST(BlockParserWithSchema, UnquotedDecimal) {
auto options = ParseOptions::Defaults();
options.explicit_schema =
schema({field("price", decimal(9, 2)), field("cost", decimal(9, 3))});
AssertParseColumns(options, unquoted_decimal_src(),
{field("price", utf8()), field("cost", utf8())},
{R"(["30.04", "1.23"])", R"(["30.001", "1.229"])"});
}

TEST(BlockParserWithSchema, MixedDecimal) {
auto options = ParseOptions::Defaults();
options.explicit_schema =
schema({field("price", decimal(9, 2)), field("cost", decimal(9, 3))});
AssertParseColumns(options, mixed_decimal_src(),
{field("price", utf8()), field("cost", utf8())},
{R"(["30.04", "1.23"])", R"(["30.001", "1.229"])"});
}

class BlockParserTypeError : public ::testing::TestWithParam<UnexpectedFieldBehavior> {
public:
ParseOptions Options(std::shared_ptr<Schema> explicit_schema) {
Expand Down
33 changes: 33 additions & 0 deletions cpp/src/arrow/json/reader_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@
#include "arrow/json/test_common.h"
#include "arrow/table.h"
#include "arrow/testing/gtest_util.h"
#include "arrow/type_fwd.h"

namespace arrow {
namespace json {
Expand Down Expand Up @@ -274,5 +275,37 @@ TEST(ReaderTest, ListArrayWithFewValues) {
AssertTablesEqual(*actual_table, *expected_table);
}

TEST_P(ReaderTest, UnquotedDecimal) {
js8544 marked this conversation as resolved.
Show resolved Hide resolved
parse_options_.unexpected_field_behavior = UnexpectedFieldBehavior::InferType;
js8544 marked this conversation as resolved.
Show resolved Hide resolved
auto schema =
::arrow::schema({field("price", decimal(9, 2)), field("cost", decimal(9, 3))});
parse_options_.explicit_schema = schema;
auto src = unquoted_decimal_src();
SetUpReader(src);
ASSERT_OK_AND_ASSIGN(table_, reader_->Read());

auto expected_table = TableFromJSON(schema, {R"([
{ "price": "30.04", "cost":"30.001" },
{ "price": "1.23", "cost":"1.229" }
])"});
AssertTablesEqual(*expected_table, *table_);
}

TEST_P(ReaderTest, MixedDecimal) {
parse_options_.unexpected_field_behavior = UnexpectedFieldBehavior::InferType;
auto schema =
::arrow::schema({field("price", decimal(9, 2)), field("cost", decimal(9, 3))});
parse_options_.explicit_schema = schema;
auto src = mixed_decimal_src();
SetUpReader(src);
ASSERT_OK_AND_ASSIGN(table_, reader_->Read());

auto expected_table = TableFromJSON(schema, {R"([
{ "price": "30.04", "cost":"30.001" },
{ "price": "1.23", "cost":"1.229" }
])"});
AssertTablesEqual(*expected_table, *table_);
}

} // namespace json
} // namespace arrow
14 changes: 14 additions & 0 deletions cpp/src/arrow/json/test_common.h
Original file line number Diff line number Diff line change
Expand Up @@ -259,5 +259,19 @@ inline static std::string null_src() {
)";
}

inline static std::string unquoted_decimal_src() {
return R"(
{ "price": 30.04, "cost":30.001 }
{ "price": 1.23, "cost":1.229 }
)";
}

inline static std::string mixed_decimal_src() {
return R"(
{ "price": 30.04, "cost": 30.001 }
{ "price": "1.23", "cost": "1.229" }
)";
}

} // namespace json
} // namespace arrow