Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

ARROW-11035: [Rust] Improved performance of casting to utf8 #9014

Closed
wants to merge 5 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions rust/arrow/benches/cast_kernels.rs
Original file line number Diff line number Diff line change
Expand Up @@ -193,6 +193,9 @@ fn add_benchmark(c: &mut Criterion) {
c.bench_function("cast utf8 to f32", |b| {
b.iter(|| cast_array(&f32_utf8_array, DataType::Float32))
});
c.bench_function("cast i64 to string 512", |b| {
b.iter(|| cast_array(&i64_array, DataType::Utf8))
});

c.bench_function("cast timestamp_ms to i64 512", |b| {
b.iter(|| cast_array(&time_ms_array, DataType::Int64))
Expand Down
69 changes: 24 additions & 45 deletions rust/arrow/src/compute/kernels/cast.rs
Original file line number Diff line number Diff line change
Expand Up @@ -351,17 +351,13 @@ pub fn cast(array: &ArrayRef, to_type: &DataType) -> Result<ArrayRef> {
Float32 => cast_bool_to_numeric::<Float32Type>(array),
Float64 => cast_bool_to_numeric::<Float64Type>(array),
Utf8 => {
let from = array.as_any().downcast_ref::<BooleanArray>().unwrap();
let mut b = StringBuilder::new(array.len());
for i in 0..array.len() {
if array.is_null(i) {
b.append(false)?;
} else {
b.append_value(if from.value(i) { "1" } else { "0" })?;
}
}

Ok(Arc::new(b.finish()) as ArrayRef)
let array = array.as_any().downcast_ref::<BooleanArray>().unwrap();
Ok(Arc::new(
array
.iter()
.map(|value| value.map(|value| if value { "1" } else { "0" }))
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Does the speed up come here from not using the string builder or is using this iterator also faster?
It looks at least better, so if no difference this is better 👍

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Good question. I suspect the builder, because the iterator does the same thing as before atm (i.e. same bound checks).

The builders IMO are inefficient atm. Since IMO they are less idiomatic, I do not see any issue in replacing them whenever we can ^_^

.collect::<StringArray>(),
))
}
_ => Err(ArrowError::ComputeError(format!(
"Casting from {:?} to {:?} not supported",
Expand Down Expand Up @@ -431,20 +427,15 @@ pub fn cast(array: &ArrayRef, to_type: &DataType) -> Result<ArrayRef> {
Float32 => cast_numeric_to_string::<Float32Type>(array),
Float64 => cast_numeric_to_string::<Float64Type>(array),
Binary => {
let from = array.as_any().downcast_ref::<BinaryArray>().unwrap();
let mut b = StringBuilder::new(array.len());
for i in 0..array.len() {
if array.is_null(i) {
b.append_null()?;
} else {
match str::from_utf8(from.value(i)) {
Ok(s) => b.append_value(s)?,
Err(_) => b.append_null()?, // not valid UTF8
}
}
}

Ok(Arc::new(b.finish()) as ArrayRef)
let array = array.as_any().downcast_ref::<BinaryArray>().unwrap();
Ok(Arc::new(
array
.iter()
.map(|maybe_value| {
maybe_value.and_then(|value| str::from_utf8(value).ok())
})
.collect::<StringArray>(),
))
}
_ => Err(ArrowError::ComputeError(format!(
"Casting from {:?} to {:?} not supported",
Expand Down Expand Up @@ -892,31 +883,22 @@ where
FROM: ArrowNumericType,
FROM::Native: std::string::ToString,
{
numeric_to_string_cast::<FROM>(
Ok(Arc::new(numeric_to_string_cast::<FROM>(
array
.as_any()
.downcast_ref::<PrimitiveArray<FROM>>()
.unwrap(),
)
.map(|to| Arc::new(to) as ArrayRef)
)))
}

fn numeric_to_string_cast<T>(from: &PrimitiveArray<T>) -> Result<StringArray>
fn numeric_to_string_cast<T>(from: &PrimitiveArray<T>) -> StringArray
where
T: ArrowPrimitiveType + ArrowNumericType,
T::Native: std::string::ToString,
{
let mut b = StringBuilder::new(from.len());

for i in 0..from.len() {
if from.is_null(i) {
b.append(false)?;
} else {
b.append_value(&from.value(i).to_string())?;
}
}

Ok(b.finish())
from.iter()
.map(|maybe_value| maybe_value.map(|value| value.to_string()))
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Note, we probably can use lexical like here later #9010

.collect()
}

/// Cast numeric types to Utf8
Expand Down Expand Up @@ -2714,11 +2696,8 @@ mod tests {
fn test_cast_string_array_to_dict() {
use DataType::*;

let mut builder = StringBuilder::new(10);
builder.append_value("one").unwrap();
builder.append_null().unwrap();
builder.append_value("three").unwrap();
let array: ArrayRef = Arc::new(builder.finish());
let array = Arc::new(StringArray::from(vec![Some("one"), None, Some("three")]))
as ArrayRef;

let expected = vec!["one", "null", "three"];

Expand Down
15 changes: 4 additions & 11 deletions rust/arrow/src/csv/reader.rs
Original file line number Diff line number Diff line change
Expand Up @@ -51,7 +51,7 @@ use std::sync::Arc;

use csv as csv_crate;

use crate::array::{ArrayRef, BooleanArray, PrimitiveArray, StringBuilder};
use crate::array::{ArrayRef, BooleanArray, PrimitiveArray, StringArray};
use crate::datatypes::*;
use crate::error::{ArrowError, Result};
use crate::record_batch::RecordBatch;
Expand Down Expand Up @@ -449,16 +449,9 @@ fn parse(
&DataType::Date64(_) => {
build_primitive_array::<Date64Type>(line_number, rows, i)
}
&DataType::Utf8 => {
let mut builder = StringBuilder::new(rows.len());
for row in rows.iter() {
match row.get(i) {
Some(s) => builder.append_value(s).unwrap(),
_ => builder.append(false).unwrap(),
}
}
Ok(Arc::new(builder.finish()) as ArrayRef)
}
&DataType::Utf8 => Ok(Arc::new(
rows.iter().map(|row| row.get(i)).collect::<StringArray>(),
) as ArrayRef),
other => Err(ArrowError::ParseError(format!(
"Unsupported data type {:?}",
other
Expand Down
23 changes: 8 additions & 15 deletions rust/arrow/src/json/reader.rs
Original file line number Diff line number Diff line change
Expand Up @@ -1115,21 +1115,14 @@ impl Decoder {
t
))),
},
DataType::Utf8 => {
let mut builder = StringBuilder::new(rows.len());
for row in rows {
if let Some(value) = row.get(field.name()) {
if let Some(str_v) = value.as_str() {
builder.append_value(str_v)?
} else {
builder.append(false)?
}
} else {
builder.append(false)?
}
}
Ok(Arc::new(builder.finish()) as ArrayRef)
}
DataType::Utf8 => Ok(Arc::new(
rows.iter()
.map(|row| {
let maybe_value = row.get(field.name());
maybe_value.and_then(|value| value.as_str())
})
.collect::<StringArray>(),
) as ArrayRef),
DataType::List(ref list_field) => {
match list_field.data_type() {
DataType::Dictionary(ref key_ty, _) => {
Expand Down