Skip to content

Commit

Permalink
Use LargeStringArray for casting when writing tables to CSV
Browse files (browse the repository at this point in the history)
Signed-off-by: Tao He <sighingnow@gmail.com>
  • Loading branch information
sighingnow committed Feb 28, 2024
1 parent 5ce060a commit da7c8cb
Showing 1 changed file with 9 additions and 9 deletions.
18 changes: 9 additions & 9 deletions cpp/src/arrow/csv/writer.cc
Original file line number | Diff line number | Diff line change
Expand Up @@ -130,8 +130,8 @@ class ColumnPopulator {
ctx.set_use_threads(false);
ASSIGN_OR_RAISE(
std::shared_ptr<Array> casted,
compute::Cast(data, /*to_type=*/utf8(), compute::CastOptions(), &ctx));
casted_array_ = checked_pointer_cast<StringArray>(casted);
compute::Cast(data, /*to_type=*/large_utf8(), compute::CastOptions(), &ctx));
casted_array_ = checked_pointer_cast<LargeStringArray>(casted);
return UpdateRowLengths(row_lengths);
}

Expand All @@ -146,7 +146,7 @@ class ColumnPopulator {

protected:
virtual Status UpdateRowLengths(int64_t* row_lengths) = 0;
std::shared_ptr<StringArray> casted_array_;
std::shared_ptr<LargeStringArray> casted_array_;
const std::string end_chars_;
std::shared_ptr<Buffer> null_string_;

Expand Down Expand Up @@ -228,7 +228,7 @@ class UnquotedColumnPopulator : public ColumnPopulator {

private:
// Returns an error status if string array has any structural characters.
static Status CheckStringArrayHasNoStructuralChars(const StringArray& array,
static Status CheckStringArrayHasNoStructuralChars(const LargeStringArray& array,
const char delimiter) {
// scan the underlying string array buffer as a single big string
const uint8_t* const data = array.raw_data() + array.value_offset(0);
Expand Down Expand Up @@ -282,14 +282,14 @@ class QuotedColumnPopulator : public ColumnPopulator {
: ColumnPopulator(pool, std::move(end_chars), std::move(null_string)) {}

Status UpdateRowLengths(int64_t* row_lengths) override {
const StringArray& input = *casted_array_;
const LargeStringArray& input = *casted_array_;

row_needs_escaping_.resize(casted_array_->length(), false);

if (NoQuoteInArray(input)) {
// fast path if no quote
int row_number = 0;
VisitArraySpanInline<StringType>(
VisitArraySpanInline<LargeStringType>(
*input.data(),
[&](std::string_view s) {
row_lengths[row_number] += static_cast<int64_t>(s.length()) + kQuoteCount;
Expand All @@ -301,7 +301,7 @@ class QuotedColumnPopulator : public ColumnPopulator {
});
} else {
int row_number = 0;
VisitArraySpanInline<StringType>(
VisitArraySpanInline<LargeStringType>(
*input.data(),
[&](std::string_view s) {
// Each quote in the value string needs to be escaped.
Expand All @@ -321,7 +321,7 @@ class QuotedColumnPopulator : public ColumnPopulator {

Status PopulateRows(char* output, int64_t* offsets) const override {
auto needs_escaping = row_needs_escaping_.begin();
VisitArraySpanInline<StringType>(
VisitArraySpanInline<LargeStringType>(
*(casted_array_->data()),
[&](std::string_view s) {
// still needs string content length to be added
Expand Down Expand Up @@ -355,7 +355,7 @@ class QuotedColumnPopulator : public ColumnPopulator {

private:
// Returns true if there's no quote in the string array
static bool NoQuoteInArray(const StringArray& array) {
static bool NoQuoteInArray(const LargeStringArray& array) {
const uint8_t* data = array.raw_data() + array.value_offset(0);
const int64_t buffer_size = array.total_values_length();
return std::memchr(data, '"', buffer_size) == nullptr;
Expand Down

0 comments on commit da7c8cb

Please sign in to comment.