Skip to content

Commit

Permalink
Use LargeStringArray for casting when writing tables to CSV
Browse files Browse the repository at this point in the history
Signed-off-by: Tao He <sighingnow@gmail.com>
  • Loading branch information
sighingnow committed Apr 11, 2024
1 parent 06d4495 commit dcf7100
Show file tree
Hide file tree
Showing 2 changed files with 82 additions and 19 deletions.
98 changes: 80 additions & 18 deletions cpp/src/arrow/csv/writer.cc
Original file line number Diff line number Diff line change
Expand Up @@ -128,10 +128,20 @@ class ColumnPopulator {
// Populators are intented to be applied to reasonably small data. In most cases
// threading overhead would not be justified.
ctx.set_use_threads(false);
ASSIGN_OR_RAISE(
std::shared_ptr<Array> casted,
compute::Cast(data, /*to_type=*/utf8(), compute::CastOptions(), &ctx));
casted_array_ = checked_pointer_cast<StringArray>(casted);
if (data.type() && is_large_binary_like(data.type()->id())) {
ASSIGN_OR_RAISE(array_, compute::Cast(data, /*to_type=*/large_utf8(),
compute::CastOptions(), &ctx));
} else {
auto casted = compute::Cast(data, /*to_type=*/utf8(), compute::CastOptions(), &ctx);
if (casted.ok()) {
array_ = std::move(casted).ValueOrDie();
} else if (casted.status().IsCapacityError()) {
ASSIGN_OR_RAISE(array_, compute::Cast(data, /*to_type=*/large_utf8(),
compute::CastOptions(), &ctx));
} else {
return casted.status();
}
}
return UpdateRowLengths(row_lengths);
}

Expand All @@ -146,7 +156,8 @@ class ColumnPopulator {

protected:
virtual Status UpdateRowLengths(int64_t* row_lengths) = 0;
std::shared_ptr<StringArray> casted_array_;
// It must be a `StringArray` or `LargeStringArray`.
std::shared_ptr<Array> array_;
const std::string end_chars_;
std::shared_ptr<Buffer> null_string_;

Expand Down Expand Up @@ -181,15 +192,28 @@ class UnquotedColumnPopulator : public ColumnPopulator {
reject_values_with_quotes_(reject_values_with_quotes) {}

Status UpdateRowLengths(int64_t* row_lengths) override {
if (ARROW_PREDICT_TRUE(array_->type_id() == Type::STRING)) {
return UpdateRowLengths<StringArray>(row_lengths);
} else if (ARROW_PREDICT_TRUE(array_->type_id() == Type::LARGE_STRING)) {
return UpdateRowLengths<LargeStringArray>(row_lengths);
} else {
return Status::TypeError("The array must be StringArray or LargeStringArray.");
}
}

template <typename StringArrayType>
Status UpdateRowLengths(int64_t* row_lengths) {
auto casted_array = checked_pointer_cast<StringArrayType>(array_);
if (reject_values_with_quotes_) {
// When working on values that, after casting, could produce quotes,
// we need to return an error in accord with RFC4180.
RETURN_NOT_OK(CheckStringArrayHasNoStructuralChars(*casted_array_, delimiter_));
RETURN_NOT_OK(CheckStringArrayHasNoStructuralChars<StringArrayType>(*casted_array,
delimiter_));
}

int64_t row_number = 0;
VisitArraySpanInline<StringType>(
*casted_array_->data(),
VisitArraySpanInline<typename StringArrayType::TypeClass>(
*casted_array->data(),
[&](std::string_view s) {
row_lengths[row_number] += static_cast<int64_t>(s.length());
row_number++;
Expand All @@ -202,6 +226,17 @@ class UnquotedColumnPopulator : public ColumnPopulator {
}

Status PopulateRows(char* output, int64_t* offsets) const override {
if (ARROW_PREDICT_TRUE(array_->type_id() == Type::STRING)) {
return PopulateRows<StringArray>(output, offsets);
} else if (ARROW_PREDICT_TRUE(array_->type_id() == Type::LARGE_STRING)) {
return PopulateRows<LargeStringArray>(output, offsets);
} else {
return Status::TypeError("The array must be StringArray or LargeStringArray.");
}
}

template <typename StringArrayType>
Status PopulateRows(char* output, int64_t* offsets) const {
// Function applied to valid values cast to string.
auto valid_function = [&](std::string_view s) {
memcpy(output + *offsets, s.data(), s.length());
Expand All @@ -222,13 +257,15 @@ class UnquotedColumnPopulator : public ColumnPopulator {
return Status::OK();
};

return VisitArraySpanInline<StringType>(*casted_array_->data(), valid_function,
null_function);
auto casted_array = checked_pointer_cast<StringArrayType>(array_);
return VisitArraySpanInline<typename StringArrayType::TypeClass>(
*casted_array->data(), valid_function, null_function);
}

private:
// Returns an error status if string array has any structural characters.
static Status CheckStringArrayHasNoStructuralChars(const StringArray& array,
template <typename ArrayType>
static Status CheckStringArrayHasNoStructuralChars(const ArrayType& array,
const char delimiter) {
// scan the underlying string array buffer as a single big string
const uint8_t* const data = array.raw_data() + array.value_offset(0);
Expand Down Expand Up @@ -282,14 +319,26 @@ class QuotedColumnPopulator : public ColumnPopulator {
: ColumnPopulator(pool, std::move(end_chars), std::move(null_string)) {}

Status UpdateRowLengths(int64_t* row_lengths) override {
const StringArray& input = *casted_array_;
if (ARROW_PREDICT_TRUE(array_->type_id() == Type::STRING)) {
return UpdateRowLengths<StringArray>(row_lengths);
} else if (ARROW_PREDICT_TRUE(array_->type_id() == Type::LARGE_STRING)) {
return UpdateRowLengths<LargeStringArray>(row_lengths);
} else {
return Status::TypeError("The array must be StringArray or LargeStringArray.");
}
}

template <typename StringArrayType>
Status UpdateRowLengths(int64_t* row_lengths) {
auto casted_array = checked_pointer_cast<StringArrayType>(array_);
const StringArrayType& input = *casted_array;

row_needs_escaping_.resize(casted_array_->length(), false);
row_needs_escaping_.resize(casted_array->length(), false);

if (NoQuoteInArray(input)) {
// fast path if no quote
int row_number = 0;
VisitArraySpanInline<StringType>(
VisitArraySpanInline<typename StringArrayType::TypeClass>(
*input.data(),
[&](std::string_view s) {
row_lengths[row_number] += static_cast<int64_t>(s.length()) + kQuoteCount;
Expand All @@ -301,7 +350,7 @@ class QuotedColumnPopulator : public ColumnPopulator {
});
} else {
int row_number = 0;
VisitArraySpanInline<StringType>(
VisitArraySpanInline<typename StringArrayType::TypeClass>(
*input.data(),
[&](std::string_view s) {
// Each quote in the value string needs to be escaped.
Expand All @@ -320,9 +369,21 @@ class QuotedColumnPopulator : public ColumnPopulator {
}

Status PopulateRows(char* output, int64_t* offsets) const override {
if (ARROW_PREDICT_TRUE(array_->type_id() == Type::STRING)) {
return PopulateRows<StringArray>(output, offsets);
} else if (ARROW_PREDICT_TRUE(array_->type_id() == Type::LARGE_STRING)) {
return PopulateRows<LargeStringArray>(output, offsets);
} else {
return Status::TypeError("The array must be StringArray or LargeStringArray.");
}
}

template <typename StringArrayType>
Status PopulateRows(char* output, int64_t* offsets) const {
auto needs_escaping = row_needs_escaping_.begin();
VisitArraySpanInline<StringType>(
*(casted_array_->data()),
auto casted_array = checked_pointer_cast<StringArrayType>(array_);
VisitArraySpanInline<typename StringArrayType::TypeClass>(
*(casted_array->data()),
[&](std::string_view s) {
// still needs string content length to be added
char* row = output + *offsets;
Expand Down Expand Up @@ -355,7 +416,8 @@ class QuotedColumnPopulator : public ColumnPopulator {

private:
// Returns true if there's no quote in the string array
static bool NoQuoteInArray(const StringArray& array) {
template <typename StringArrayType>
static bool NoQuoteInArray(const StringArrayType& array) {
const uint8_t* data = array.raw_data() + array.value_offset(0);
const int64_t buffer_size = array.total_values_length();
return std::memchr(data, '"', buffer_size) == nullptr;
Expand Down
3 changes: 2 additions & 1 deletion cpp/src/arrow/csv/writer.h
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,8 @@ namespace arrow {
namespace csv {

// Functionality for converting Arrow data to Comma separated value text.
// This library supports all primitive types that can be cast to a StringArrays.
// This library supports all primitive types that can be cast to a StringArray or
// a LargeStringArray.
// It applies to following formatting rules:
// - For non-binary types no quotes surround values. Nulls are represented as the empty
// string.
Expand Down

0 comments on commit dcf7100

Please sign in to comment.