fix: substring with negative start index #4017
Conversation
| let result = DictionaryArray::try_new(dict.keys().clone(), values)?; | ||
| Ok(Arc::new(result) as ArrayRef) | ||
| } | ||
| _ => Ok(Arc::clone(array)), |
There was a problem hiding this comment.
Should this be an error rather than just returning the input data?
There was a problem hiding this comment.
Thank you, updated
| Ok(Arc::new(builder.finish()) as ArrayRef) | ||
| } | ||
| DataType::Dictionary(_, _) => { | ||
| let dict = as_dictionary_array::<Int32Type>(array); |
There was a problem hiding this comment.
This would panic for a dictionary with Int64Type. Can we add a check for the type?
There was a problem hiding this comment.
Parquet dictionary uses Integer, so we do not handle Int64Type, either here or in other locations. E.g. https://github.com/apache/datafusion-comet/blob/main/native/spark-expr/src/static_invoke/char_varchar_utils/read_side_padding.rs#L68
/// Extracts a substring of `s` for a negative start position.
///
/// `pos` is added to the character count of `s`, so `-1` addresses the last
/// character, `-2` the one before it, and so on. `len` is the window length
/// measured from that (possibly negative) position. All positions are counted
/// in `char`s, not bytes, so multi-byte text is never split mid-character.
///
/// If `pos` reaches back before the start of the string, the window still
/// begins at the virtual negative index; only the part of the window that
/// overlaps the string is returned. For example, `("hello", -10, 7)` covers
/// the virtual range `[-5, 2)` and yields `"he"`.
///
/// Returns an empty string when the window does not overlap `s` at all.
fn spark_substr_negative(s: &str, pos: i64, len: u64) -> String {
    let num_chars = s.chars().count() as i64;

    // A plain `len as i64` wraps to a negative number for `len > i64::MAX`,
    // which would turn "take everything" into an empty result. Clamp first.
    let len = len.min(i64::MAX as u64) as i64;

    // Virtual (possibly negative) index at which the window opens.
    let window_start = num_chars.saturating_add(pos);
    // The exclusive end is computed from the unclamped start so that a window
    // opening before index 0 is shortened accordingly.
    let end = window_start.saturating_add(len).min(num_chars);
    let start = window_start.max(0);

    if start >= end {
        return String::new();
    }

    s.chars()
        .skip(start as usize)
        .take((end - start) as usize)
        .collect()
}
There was a problem hiding this comment.
Claude recommended an optimized version to avoid an intermediate string allocation per row. I have not verified.
fn spark_substr_negative(s: &str, pos: i64, len: u64) -> &str {
let num_chars = s.chars().count() as i64;
let end = (num_chars + pos).saturating_add(len as i64).min(num_chars);
let start = (num_chars + pos).max(0);
if start >= end {
return "";
}
// Translate char indices [start, end) to byte offsets in a single forward pass.
let mut it = s.char_indices();
let byte_start = it.by_ref().nth(start as usize).map(|(b, _)| b).unwrap_or(s.len());
let span = (end - start - 1) as usize;
let byte_end = it.nth(span).map(|(b, _)| b).unwrap_or(s.len());
&s[byte_start..byte_end]
}
| SELECT substring('こんにちは世界', -2) | ||
|
|
||
| query | ||
| SELECT substring('🎉🎊🎈🎁', 2, 2) |
comphead
left a comment
There was a problem hiding this comment.
Thanks @kazuyukitanimura. I think we would also need to donate this to DF later.
andygrove
left a comment
There was a problem hiding this comment.
LGTM. Thanks @kazuyukitanimura.
nit: there is some overlap between the Scala and SQL tests, and the SQL tests alone may have been sufficient, but we can review that later.
Which issue does this PR close?
Closes #3919
Closes #3337
Rationale for this change
What changes are included in this PR?
How are these changes tested?