Skip to content

Commit

Permalink
ARROW-17847: [C++] Support unquoted decimal in JSON parser
Browse files Browse the repository at this point in the history
  • Loading branch information
js8544 committed Sep 26, 2022
1 parent be30611 commit 91131ad
Show file tree
Hide file tree
Showing 6 changed files with 90 additions and 14 deletions.
3 changes: 3 additions & 0 deletions cpp/src/arrow/json/options.h
Original file line number Diff line number Diff line change
Expand Up @@ -53,6 +53,9 @@ struct ARROW_EXPORT ParseOptions {
/// How JSON fields outside of explicit_schema (if given) are treated
UnexpectedFieldBehavior unexpected_field_behavior = UnexpectedFieldBehavior::InferType;

/// Whether decimal values are read from JSON numbers (unquoted) instead of JSON strings (quoted)
bool parse_decimal_as_number = false;

/// Create parsing options with default values
static ParseOptions Defaults();
};
Expand Down
32 changes: 20 additions & 12 deletions cpp/src/arrow/json/parser.cc
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@
#include <utility>
#include <vector>

#include "arrow/json/options.h"
#include "arrow/json/rapidjson_defs.h"
#include "rapidjson/error/en.h"
#include "rapidjson/reader.h"
Expand Down Expand Up @@ -92,7 +93,8 @@ Kind::type Kind::FromTag(const std::shared_ptr<const KeyValueMetadata>& tag) {
return static_cast<Kind::type>(name_to_kind.Find(name));
}

Status Kind::ForType(const DataType& type, Kind::type* kind) {
Status Kind::ForType(const DataType& type, const ParseOptions& options,
Kind::type* kind) {
struct {
Status Visit(const NullType&) { return SetKind(Kind::kNull); }
Status Visit(const BooleanType&) { return SetKind(Kind::kBoolean); }
Expand All @@ -102,9 +104,12 @@ Status Kind::ForType(const DataType& type, Kind::type* kind) {
Status Visit(const BinaryType&) { return SetKind(Kind::kString); }
Status Visit(const LargeBinaryType&) { return SetKind(Kind::kString); }
Status Visit(const TimestampType&) { return SetKind(Kind::kString); }
Status Visit(const FixedSizeBinaryType&) { return SetKind(Kind::kString); }
Status Visit(const FixedSizeBinaryType&) {
return options_.parse_decimal_as_number ? SetKind(Kind::kNumber)
: SetKind(Kind::kString);
}
Status Visit(const DictionaryType& dict_type) {
return Kind::ForType(*dict_type.value_type(), kind_);
return Kind::ForType(*dict_type.value_type(), options_, kind_);
}
Status Visit(const ListType&) { return SetKind(Kind::kArray); }
Status Visit(const StructType&) { return SetKind(Kind::kObject); }
Expand All @@ -115,8 +120,9 @@ Status Kind::ForType(const DataType& type, Kind::type* kind) {
*kind_ = kind;
return Status::OK();
}
const ParseOptions& options_;
Kind::type* kind_;
} visitor = {kind};
} visitor = {options, kind};
return VisitTypeInline(type, &visitor);
}

Expand Down Expand Up @@ -393,7 +399,8 @@ class RawArrayBuilder<Kind::kObject> {

class RawBuilderSet {
public:
explicit RawBuilderSet(MemoryPool* pool) : pool_(pool) {}
explicit RawBuilderSet(MemoryPool* pool, const ParseOptions& options)
: pool_(pool), options_(options) {}

/// Retrieve a pointer to a builder from a BuilderPtr
template <Kind::type kind>
Expand All @@ -415,7 +422,7 @@ class RawBuilderSet {
/// construct a builder of whatever kind corresponds to a DataType
Status MakeBuilder(const DataType& t, int64_t leading_nulls, BuilderPtr* builder) {
Kind::type kind;
RETURN_NOT_OK(Kind::ForType(t, &kind));
RETURN_NOT_OK(Kind::ForType(t, options_, &kind));
switch (kind) {
case Kind::kNull:
*builder = BuilderPtr(Kind::kNull, static_cast<uint32_t>(leading_nulls), true);
Expand Down Expand Up @@ -565,6 +572,7 @@ class RawBuilderSet {
std::vector<RawArrayBuilder<Kind::kArray>>,
std::vector<RawArrayBuilder<Kind::kObject>>>
arenas_;
const ParseOptions& options_;
};

/// Three implementations are provided for BlockParser, one for each
Expand All @@ -573,9 +581,9 @@ class RawBuilderSet {
class HandlerBase : public BlockParser,
public rj::BaseReaderHandler<rj::UTF8<>, HandlerBase> {
public:
explicit HandlerBase(MemoryPool* pool)
: BlockParser(pool),
builder_set_(pool),
explicit HandlerBase(MemoryPool* pool, const ParseOptions& options)
: BlockParser(pool, options),
builder_set_(pool, options),
field_index_(-1),
scalar_values_builder_(pool) {}

Expand Down Expand Up @@ -1086,15 +1094,15 @@ Status BlockParser::Make(MemoryPool* pool, const ParseOptions& options,

switch (options.unexpected_field_behavior) {
case UnexpectedFieldBehavior::Ignore: {
*out = std::make_unique<Handler<UnexpectedFieldBehavior::Ignore>>(pool);
*out = std::make_unique<Handler<UnexpectedFieldBehavior::Ignore>>(pool, options);
break;
}
case UnexpectedFieldBehavior::Error: {
*out = std::make_unique<Handler<UnexpectedFieldBehavior::Error>>(pool);
*out = std::make_unique<Handler<UnexpectedFieldBehavior::Error>>(pool, options);
break;
}
case UnexpectedFieldBehavior::InferType:
*out = std::make_unique<Handler<UnexpectedFieldBehavior::InferType>>(pool);
*out = std::make_unique<Handler<UnexpectedFieldBehavior::InferType>>(pool, options);
break;
}
return static_cast<HandlerBase&>(**out).Initialize(options.explicit_schema);
Expand Down
7 changes: 5 additions & 2 deletions cpp/src/arrow/json/parser.h
Original file line number Diff line number Diff line change
Expand Up @@ -45,7 +45,8 @@ struct Kind {

static Kind::type FromTag(const std::shared_ptr<const KeyValueMetadata>& tag);

static Status ForType(const DataType& type, Kind::type* kind);
static Status ForType(const DataType& type, const ParseOptions& options,
Kind::type* kind);
};

constexpr int32_t kMaxParserNumRows = 100000;
Expand Down Expand Up @@ -91,10 +92,12 @@ class ARROW_EXPORT BlockParser {
protected:
ARROW_DISALLOW_COPY_AND_ASSIGN(BlockParser);

explicit BlockParser(MemoryPool* pool) : pool_(pool) {}
explicit BlockParser(MemoryPool* pool, const ParseOptions& options)
: pool_(pool), options_(options) {}

MemoryPool* pool_;
int32_t num_rows_ = 0;
const ParseOptions& options_;
};

} // namespace json
Expand Down
20 changes: 20 additions & 0 deletions cpp/src/arrow/json/parser_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@
#include "arrow/json/test_common.h"
#include "arrow/status.h"
#include "arrow/testing/gtest_util.h"
#include "arrow/type_fwd.h"
#include "arrow/util/checked_cast.h"

namespace arrow {
Expand Down Expand Up @@ -136,6 +137,25 @@ TEST(BlockParserWithSchema, SkipFieldsOutsideSchema) {
"[\"thing\", null, \"\xe5\xbf\x8d\", null]"});
}

// Parses the unquoted-decimal fixture against an explicit decimal schema with
// parse_decimal_as_number enabled, and compares the result against utf8
// columns holding the text of each number.
TEST(BlockParserWithSchema, UnquotedDecimal) {
  const auto decimal_schema =
      schema({field("price", decimal(9, 2)), field("cost", decimal(9, 3))});

  auto options = ParseOptions::Defaults();
  options.parse_decimal_as_number = true;
  options.explicit_schema = decimal_schema;

  AssertParseColumns(options, unquoted_decimal_src(),
                     {field("price", utf8()), field("cost", utf8())},
                     {R"(["30.04", "1.23"])", R"(["30.001", "1.229"])"});
}

// With parse_decimal_as_number enabled, a decimal column whose values are
// quoted strings ("cost" in the mixed fixture) must be rejected as Invalid.
//
// The schema uses decimal(9, 3) for "cost" so that it matches the three
// fractional digits in the fixture and the schema used by the other decimal
// tests; the quoted representation is then the only mismatch in the input.
TEST(BlockParserWithSchema, MixedDecimal) {
  auto options = ParseOptions::Defaults();
  options.parse_decimal_as_number = true;
  options.explicit_schema =
      schema({field("price", decimal(9, 2)), field("cost", decimal(9, 3))});

  std::shared_ptr<arrow::Array> parsed;
  ASSERT_RAISES(Invalid, ParseFromString(options, mixed_decimal_src(), &parsed));
}

class BlockParserTypeError : public ::testing::TestWithParam<UnexpectedFieldBehavior> {
public:
ParseOptions Options(std::shared_ptr<Schema> explicit_schema) {
Expand Down
28 changes: 28 additions & 0 deletions cpp/src/arrow/json/reader_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@
#include "arrow/json/test_common.h"
#include "arrow/table.h"
#include "arrow/testing/gtest_util.h"
#include "arrow/type_fwd.h"

namespace arrow {
namespace json {
Expand Down Expand Up @@ -274,5 +275,32 @@ TEST(ReaderTest, ListArrayWithFewValues) {
AssertTablesEqual(*actual_table, *expected_table);
}

// Reads the unquoted-decimal fixture through the table reader with
// parse_decimal_as_number enabled and checks the resulting table against
// decimal arrays built from the same values.
TEST_P(ReaderTest, UnquotedDecimal) {
  const auto decimal_schema =
      ::arrow::schema({field("price", decimal(9, 2)), field("cost", decimal(9, 3))});

  parse_options_.explicit_schema = decimal_schema;
  parse_options_.parse_decimal_as_number = true;
  parse_options_.unexpected_field_behavior = UnexpectedFieldBehavior::InferType;

  // Keep the input in a named local for the duration of the read.
  // NOTE(review): whether SetUpReader copies its input is not visible here.
  auto src = unquoted_decimal_src();
  SetUpReader(src);
  ASSERT_OK_AND_ASSIGN(table_, reader_->Read());

  auto expected_price =
      ArrayFromJSON(decimal_schema->field(0)->type(), R"(["30.04", "1.23"])");
  auto expected_cost =
      ArrayFromJSON(decimal_schema->field(1)->type(), R"(["30.001", "1.229"])");
  auto expected_table = Table::Make(decimal_schema, {expected_price, expected_cost});
  AssertTablesEqual(*expected_table, *table_);
}

// Reads the mixed fixture (unquoted "price", quoted "cost") with
// parse_decimal_as_number enabled and expects the read to fail with Invalid.
TEST_P(ReaderTest, MixedDecimal) {
  const auto decimal_schema =
      ::arrow::schema({field("price", decimal(9, 2)), field("cost", decimal(9, 3))});

  parse_options_.explicit_schema = decimal_schema;
  parse_options_.parse_decimal_as_number = true;
  parse_options_.unexpected_field_behavior = UnexpectedFieldBehavior::InferType;

  // Keep the input in a named local for the duration of the read.
  // NOTE(review): whether SetUpReader copies its input is not visible here.
  auto src = mixed_decimal_src();
  SetUpReader(src);
  ASSERT_RAISES(Invalid, reader_->Read());
}

} // namespace json
} // namespace arrow
14 changes: 14 additions & 0 deletions cpp/src/arrow/json/test_common.h
Original file line number Diff line number Diff line change
Expand Up @@ -259,5 +259,19 @@ inline static std::string null_src() {
)";
}

/// Test fixture: two newline-delimited JSON objects whose "price" and "cost"
/// values are both written as bare (unquoted) numbers. The text begins and
/// ends with a newline.
inline static std::string unquoted_decimal_src() {
  static constexpr char kRows[] = R"(
{ "price": 30.04, "cost":30.001 }
{ "price": 1.23, "cost":1.229 }
)";
  return kRows;
}

/// Test fixture: two newline-delimited JSON objects in which "price" is a
/// bare (unquoted) number while "cost" is a quoted string. The text begins
/// and ends with a newline.
inline static std::string mixed_decimal_src() {
  static constexpr char kRows[] = R"(
{ "price": 30.04, "cost": "30.001" }
{ "price": 1.23, "cost": "1.229" }
)";
  return kRows;
}

} // namespace json
} // namespace arrow

0 comments on commit 91131ad

Please sign in to comment.