Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

HIVE-26713: StringExpr ArrayIndexOutOfBoundsException with LIKE '%xxx%' #4999

Merged
merged 9 commits into from
Jan 23, 2024
Merged
Show file tree
Hide file tree
Changes from 5 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
15 changes: 15 additions & 0 deletions ql/src/test/queries/clientpositive/like_control_characters.q
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
-- Regression test for HIVE-26713: LIKE with a %xxx% pattern applied to a
-- string value that contains a non-printable byte.
set hive.mapred.mode=nonstrict;
set hive.explain.user=false;
-- Run with vectorized execution enabled.
set hive.vectorized.execution.enabled=true;

-- bar holds the raw bytes, foo receives them as a string column.
create temporary table foo (col string) stored as orc;
create temporary table bar (col binary) stored as orc;

-- SORT_QUERY_RESULTS

-- The hex text decodes to abcde, then one extra byte, then fghi.
-- NOTE(review): the -1 pair is not regular hex and looks intentional. It
-- presumably decodes to the single byte 0xFF, since the expected output shows
-- one non-printable byte between abcde and fghi. Confirm against unhex.
INSERT INTO bar select unhex('6162636465-166676869');
ryukobayashi marked this conversation as resolved.
Show resolved Hide resolved
-- Copy the binary value into the string column so that LIKE can be applied.
INSERT INTO foo SELECT col FROM bar;

-- Show the plan, then run the same query. The single inserted row contains fg
-- after the non-printable byte, so it is expected to match with a count of 1.
explain select col, count(*) from foo where col like '%fg%' group by col;
select col, count(*) from foo where col like '%fg%' group by col;

111 changes: 111 additions & 0 deletions ql/src/test/results/clientpositive/llap/like_control_characters.q.out
Original file line number Diff line number Diff line change
@@ -0,0 +1,111 @@
PREHOOK: query: create temporary table foo (col string) stored as orc
PREHOOK: type: CREATETABLE
PREHOOK: Output: database:default
PREHOOK: Output: default@foo
POSTHOOK: query: create temporary table foo (col string) stored as orc
POSTHOOK: type: CREATETABLE
POSTHOOK: Output: database:default
POSTHOOK: Output: default@foo
PREHOOK: query: create temporary table bar (col binary) stored as orc
PREHOOK: type: CREATETABLE
PREHOOK: Output: database:default
PREHOOK: Output: default@bar
POSTHOOK: query: create temporary table bar (col binary) stored as orc
POSTHOOK: type: CREATETABLE
POSTHOOK: Output: database:default
POSTHOOK: Output: default@bar
PREHOOK: query: INSERT INTO bar select unhex('6162636465-166676869')
PREHOOK: type: QUERY
PREHOOK: Input: _dummy_database@_dummy_table
PREHOOK: Output: default@bar
POSTHOOK: query: INSERT INTO bar select unhex('6162636465-166676869')
POSTHOOK: type: QUERY
POSTHOOK: Input: _dummy_database@_dummy_table
POSTHOOK: Output: default@bar
POSTHOOK: Lineage: bar.col SIMPLE []
PREHOOK: query: INSERT INTO foo SELECT col FROM bar
PREHOOK: type: QUERY
PREHOOK: Input: default@bar
PREHOOK: Output: default@foo
POSTHOOK: query: INSERT INTO foo SELECT col FROM bar
POSTHOOK: type: QUERY
POSTHOOK: Input: default@bar
POSTHOOK: Output: default@foo
POSTHOOK: Lineage: foo.col EXPRESSION [(bar)bar.FieldSchema(name:col, type:binary, comment:null), ]
PREHOOK: query: explain select col, count(*) from foo where col like '%fg%' group by col
PREHOOK: type: QUERY
PREHOOK: Input: default@foo
#### A masked pattern was here ####
POSTHOOK: query: explain select col, count(*) from foo where col like '%fg%' group by col
POSTHOOK: type: QUERY
POSTHOOK: Input: default@foo
#### A masked pattern was here ####
STAGE DEPENDENCIES:
Stage-1 is a root stage
Stage-0 depends on stages: Stage-1

STAGE PLANS:
Stage: Stage-1
Tez
#### A masked pattern was here ####
Edges:
Reducer 2 <- Map 1 (SIMPLE_EDGE)
#### A masked pattern was here ####
Vertices:
Map 1
Map Operator Tree:
TableScan
alias: foo
filterExpr: (col like '%fg%') (type: boolean)
Statistics: Num rows: 1 Data size: 184 Basic stats: COMPLETE Column stats: NONE
Filter Operator
predicate: (col like '%fg%') (type: boolean)
Statistics: Num rows: 1 Data size: 184 Basic stats: COMPLETE Column stats: NONE
Group By Operator
aggregations: count()
keys: col (type: string)
minReductionHashAggr: 0.99
mode: hash
outputColumnNames: _col0, _col1
Statistics: Num rows: 1 Data size: 184 Basic stats: COMPLETE Column stats: NONE
Reduce Output Operator
key expressions: _col0 (type: string)
null sort order: z
sort order: +
Map-reduce partition columns: _col0 (type: string)
Statistics: Num rows: 1 Data size: 184 Basic stats: COMPLETE Column stats: NONE
value expressions: _col1 (type: bigint)
Execution mode: vectorized, llap
LLAP IO: all inputs
Reducer 2
Execution mode: vectorized, llap
Reduce Operator Tree:
Group By Operator
aggregations: count(VALUE._col0)
keys: KEY._col0 (type: string)
mode: mergepartial
outputColumnNames: _col0, _col1
Statistics: Num rows: 1 Data size: 184 Basic stats: COMPLETE Column stats: NONE
File Output Operator
compressed: false
Statistics: Num rows: 1 Data size: 184 Basic stats: COMPLETE Column stats: NONE
table:
input format: org.apache.hadoop.mapred.SequenceFileInputFormat
output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat
serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe

Stage: Stage-0
Fetch Operator
limit: -1
Processor Tree:
ListSink

PREHOOK: query: select col, count(*) from foo where col like '%fg%' group by col
PREHOOK: type: QUERY
PREHOOK: Input: default@foo
#### A masked pattern was here ####
POSTHOOK: query: select col, count(*) from foo where col like '%fg%' group by col
POSTHOOK: type: QUERY
POSTHOOK: Input: default@foo
#### A masked pattern was here ####
abcde�fghi 1
Original file line number Diff line number Diff line change
Expand Up @@ -342,7 +342,15 @@ public int find(byte[] input, int start, int len) {
}
s_tmp--;
}
next += shift[input[next] & MAX_BYTE];

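// input[next] & MAX_BYTE can be as large as MAX_BYTE itself. Using that value
// as an index into shift reportedly throws ArrayIndexOutOfBoundsException
// (HIVE-26713), so in that case advance by a single position instead.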
int shiftIndex = input[next] & MAX_BYTE;
if (shiftIndex >= MAX_BYTE) {
next++;
} else {
next += shift[shiftIndex];
}
}
return -1;
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@

import org.junit.Test;

import java.io.ByteArrayOutputStream;
import java.nio.charset.StandardCharsets;

import static org.junit.Assert.*;
zhangbutao marked this conversation as resolved.
Show resolved Hide resolved
Expand Down Expand Up @@ -49,6 +50,24 @@ public void test() throws Exception {
assertEquals("Testing match at end of string", 24, find(pattern, input4));
}

/**
 * Searches for "pattern" in an input where a single byte of value -1 sits
 * between the prefix "abcedf" and the pattern itself, and expects the match
 * to be reported at offset 7 (six prefix bytes plus the one extra byte).
 */
@Test
public void testControlCharacters() throws Exception {
  final StringExpr.Finder finder = compile("pattern");
  assertNotNull(finder);

  final byte[] prefix = "abcedf".getBytes(StandardCharsets.UTF_8);
  final byte[] needle = "pattern".getBytes(StandardCharsets.UTF_8);

  // Assemble prefix + (byte) -1 + needle into one contiguous input buffer.
  final byte[] haystack;
  try (ByteArrayOutputStream buffer = new ByteArrayOutputStream()) {
    buffer.write(prefix);
    buffer.write((byte) -1);
    buffer.write(needle);
    haystack = buffer.toByteArray();
  }

  final int matchOffset = finder.find(haystack, 0, haystack.length);
  assertEquals("Testing valid match", 7, matchOffset);
}

/**
 * Encodes {@code pattern} as UTF-8 and passes the resulting bytes to
 * {@code StringExpr.compile}.
 *
 * @param pattern the search pattern as text
 * @return whatever {@code StringExpr.compile} returns for the encoded pattern
 */
private StringExpr.Finder compile(String pattern) {
  final byte[] utf8Pattern = pattern.getBytes(StandardCharsets.UTF_8);
  return StringExpr.compile(utf8Pattern);
}
Expand Down