Skip to content

Commit

Permalink
Use Automaton for SurroundQuery prefix/pattern matching (#12999)
Browse files Browse the repository at this point in the history
(cherry picked from commit 89a02fa)
  • Loading branch information
magibney committed Jan 10, 2024
1 parent 024b2bd commit 86ccef0
Show file tree
Hide file tree
Showing 6 changed files with 31 additions and 104 deletions.
2 changes: 2 additions & 0 deletions lucene/CHANGES.txt
Expand Up @@ -33,6 +33,8 @@ Improvements

* GITHUB#12910: Refactor around NeighborArray to make it more self-contained. (Patrick Zhai)

* GITHUB#12999: Use Automaton for SurroundQuery prefix/pattern matching (Michael Gibney)

Optimizations
---------------------

Expand Down
@@ -1,7 +1,7 @@
{
"lucene/queryparser/src/java/org/apache/lucene/queryparser/surround/parser/ParseException.java": "d8b3e605b4bfb01697df5ce246e84fa2b691fb4f",
"lucene/queryparser/src/java/org/apache/lucene/queryparser/surround/parser/QueryParser.java": "e79256ffc3859ac60deca6957ce742c13c1e5649",
"lucene/queryparser/src/java/org/apache/lucene/queryparser/surround/parser/QueryParser.jj": "21b38627431747c741e2ec24be1e7aef38dc70c9",
"lucene/queryparser/src/java/org/apache/lucene/queryparser/surround/parser/QueryParser.java": "f07a1c6a54c544a01c1ba19dd468c2c0a86cb9d8",
"lucene/queryparser/src/java/org/apache/lucene/queryparser/surround/parser/QueryParser.jj": "2a288b7c933ab757c781890c41bea5e5c4fa3b49",
"lucene/queryparser/src/java/org/apache/lucene/queryparser/surround/parser/QueryParserConstants.java": "8feb77878890c27e874be457d839eba48192c40f",
"lucene/queryparser/src/java/org/apache/lucene/queryparser/surround/parser/QueryParserTokenManager.java": "959523aec4e49f9665e39f16e1da335aab3632d1",
"lucene/queryparser/src/java/org/apache/lucene/queryparser/surround/parser/Token.java": "a5eea2a3043e0aa2781f4a43b9ab9c5d59add80e",
Expand Down
Expand Up @@ -151,7 +151,7 @@ protected boolean allowedTruncation(String truncated) {
}

/**
 * Builds the surround query node for a truncated (wildcard) term.
 *
 * <p>NOTE(review): two consecutive {@code return} statements appear here, so the second is
 * unreachable as written. This looks like a rendered diff hunk in which the three-argument
 * {@code SrndTruncQuery} call is the removed line and the one-argument call is its replacement;
 * confirm against the real file.
 *
 * @param truncated the truncated term text, handed unchanged to {@code SrndTruncQuery}
 * @return a new {@code SrndTruncQuery} built from {@code truncated}
 */
protected SrndQuery getTruncQuery(String truncated) {
return new SrndTruncQuery(truncated, TRUNCATOR, ANY_CHAR);
return new SrndTruncQuery(truncated);
}

final public SrndQuery TopSrndQuery() throws ParseException {SrndQuery q;
Expand Down
Expand Up @@ -179,7 +179,7 @@ public class QueryParser {
}

/**
 * Builds the surround query node for a truncated (wildcard) term.
 *
 * <p>NOTE(review): two consecutive {@code return} statements appear here, so the second is
 * unreachable as written. This looks like a rendered diff hunk in which the three-argument
 * {@code SrndTruncQuery} call is the removed line and the one-argument call is its replacement;
 * confirm against the real grammar file.
 *
 * @param truncated the truncated term text, handed unchanged to {@code SrndTruncQuery}
 * @return a new {@code SrndTruncQuery} built from {@code truncated}
 */
protected SrndQuery getTruncQuery(String truncated) {
return new SrndTruncQuery(truncated, TRUNCATOR, ANY_CHAR);
return new SrndTruncQuery(truncated);
}
}

Expand Down
Expand Up @@ -22,17 +22,19 @@
import org.apache.lucene.index.Term;
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.search.PrefixQuery;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.StringHelper;
import org.apache.lucene.util.automaton.CompiledAutomaton;

/** Query that matches String prefixes */
public class SrndPrefixQuery extends SimpleTerm {
private final BytesRef prefixRef;
private final CompiledAutomaton compiled;

/**
 * Creates a prefix query for {@code prefix}.
 *
 * <p>NOTE(review): both the {@code prefixRef} assignment and the {@code compiled} assignment are
 * shown. Presumably this is a diff rendering in which {@code prefixRef} is the removed line and
 * the {@code CompiledAutomaton} is its replacement; confirm which one is current.
 *
 * @param prefix the prefix text; stored as-is and also converted to a {@code BytesRef}
 * @param quoted forwarded to the superclass constructor
 * @param truncator stored in the {@code truncator} field
 */
public SrndPrefixQuery(String prefix, boolean quoted, char truncator) {
super(quoted);
this.prefix = prefix;
prefixRef = new BytesRef(prefix);
// Compile the automaton returned by PrefixQuery.toAutomaton for the prefix bytes once, up front,
// so visitMatchingTerms can reuse it.
// NOTE(review): the three boolean flags are presumably finite/simplify/isBinary. If so,
// finite=true is surprising for a prefix automaton, which accepts arbitrarily long terms.
// Confirm against the CompiledAutomaton version in use.
compiled =
new CompiledAutomaton(PrefixQuery.toAutomaton(new BytesRef(prefix)), true, true, true);
this.truncator = truncator;
}

Expand All @@ -48,10 +50,6 @@ public char getSuffixOperator() {
return truncator;
}

/**
 * Wraps this query's prefix in a Lucene {@code Term} for the given field.
 *
 * @param fieldName field the returned term belongs to
 * @return a term on {@code fieldName} whose text is the value of {@code getPrefix()}
 */
public Term getLucenePrefixTerm(String fieldName) {
  final String prefixText = getPrefix();
  return new Term(fieldName, prefixText);
}

@Override
public String toStringUnquoted() {
return getPrefix();
Expand All @@ -65,35 +63,13 @@ protected void suffixToString(StringBuilder r) {
/**
 * Passes terms of {@code fieldName} that match this prefix query to {@code mtv}. Nothing is
 * visited when {@code MultiTerms.getTerms} returns {@code null} for the field.
 *
 * <p>NOTE(review): this body interleaves two implementations, a {@code seekCeil} /
 * {@code StringHelper.startsWith} scan and a {@code CompiledAutomaton}-driven enumeration, and
 * declares {@code termsEnum} twice. It reads like a rendered diff with removed and added lines
 * mixed together; confirm which lines are current before relying on it.
 *
 * @param reader index reader whose terms are enumerated
 * @param fieldName field whose terms are examined
 * @param mtv callback that receives each matching term
 * @throws IOException propagated from the underlying terms access
 */
@Override
public void visitMatchingTerms(IndexReader reader, String fieldName, MatchingTermVisitor mtv)
throws IOException {
/* inspired by PrefixQuery.rewrite(): */
Terms terms = MultiTerms.getTerms(reader, fieldName);
if (terms != null) {
TermsEnum termsEnum = terms.iterator();

// Seek-based variant: position on the first term >= prefix and decide whether to scan at all.
boolean skip = false;
TermsEnum.SeekStatus status = termsEnum.seekCeil(new BytesRef(getPrefix()));
if (status == TermsEnum.SeekStatus.FOUND) {
mtv.visitMatchingTerm(getLucenePrefixTerm(fieldName));
} else if (status == TermsEnum.SeekStatus.NOT_FOUND) {
if (StringHelper.startsWith(termsEnum.term(), prefixRef)) {
mtv.visitMatchingTerm(new Term(fieldName, termsEnum.term().utf8ToString()));
} else {
skip = true;
}
} else {
// EOF
skip = true;
}
// Automaton-based variant: the enum is obtained from the precompiled automaton.
TermsEnum termsEnum = compiled.getTermsEnum(terms);

if (!skip) {
while (true) {
BytesRef text = termsEnum.next();
if (text != null && StringHelper.startsWith(text, prefixRef)) {
mtv.visitMatchingTerm(new Term(fieldName, text.utf8ToString()));
} else {
break;
}
}
BytesRef br;
while ((br = termsEnum.next()) != null) {
// Deep copy before handing off; presumably the enum reuses its BytesRef between next()
// calls (TODO confirm).
mtv.visitMatchingTerm(new Term(fieldName, BytesRef.deepCopyOf(br)));
}
}
}
Expand Down
Expand Up @@ -17,33 +17,32 @@
package org.apache.lucene.queryparser.surround.query;

import java.io.IOException;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.MultiTerms;
import org.apache.lucene.index.Term;
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.search.WildcardQuery;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.StringHelper;
import org.apache.lucene.util.automaton.CompiledAutomaton;
import org.apache.lucene.util.automaton.Operations;

/** Query that matches wildcards */
public class SrndTruncQuery extends SimpleTerm {
/**
 * Creates a truncation (wildcard) query for {@code truncated}; the term is always treated as
 * unquoted.
 *
 * <p>NOTE(review): two constructor headers and two initialisation paths appear back to back,
 * namely {@code truncatedToPrefixAndPattern()} and the {@code CompiledAutomaton}. Presumably this
 * is a diff rendering of an older three-argument constructor and a newer one-argument
 * constructor; confirm which is current.
 */
public SrndTruncQuery(String truncated, char unlimited, char mask) {
public SrndTruncQuery(String truncated) {
super(false); /* not quoted */
this.truncated = truncated;
this.unlimited = unlimited;
this.mask = mask;
truncatedToPrefixAndPattern();
// Compile the wildcard automaton once, up front, so visitMatchingTerms can reuse it.
// The Term is built with a null field; presumably only its text is read when building the
// automaton (TODO confirm against WildcardQuery.toAutomaton).
compiled =
new CompiledAutomaton(
WildcardQuery.toAutomaton(
new Term(null, truncated), Operations.DEFAULT_DETERMINIZE_WORK_LIMIT),
false,
true,
true);
}

private final String truncated;
private final char unlimited;
private final char mask;

private String prefix;
private BytesRef prefixRef;
private Pattern pattern;
private final CompiledAutomaton compiled;

public String getTruncated() {
return truncated;
Expand All @@ -54,66 +53,16 @@ public String toStringUnquoted() {
return getTruncated();
}

/**
 * Tells whether {@code c} is an ordinary character rather than one of the two wildcard markers.
 *
 * @param c character taken from the truncated term
 * @return {@code false} when {@code c} equals {@code unlimited} or {@code mask}, {@code true}
 *     otherwise
 */
protected boolean matchingChar(char c) {
  final boolean isWildcard = (c == unlimited) || (c == mask);
  return !isWildcard;
}

/**
 * Appends the regular-expression fragment for one character of the truncated term.
 *
 * <p>The {@code unlimited} marker becomes {@code .*}, the {@code mask} marker becomes {@code .},
 * and any other character is appended unchanged (it is not regex-escaped).
 *
 * @param c character taken from the truncated term
 * @param re builder receiving the regular-expression text
 */
protected void appendRegExpForChar(char c, StringBuilder re) {
  if (c == unlimited) {
    re.append(".*");
    return;
  }
  if (c == mask) {
    re.append(".");
    return;
  }
  re.append(c);
}

/**
 * Splits {@code truncated} into a literal prefix and a regular expression for the remainder.
 *
 * <p>The prefix is the longest leading run of characters accepted by {@code matchingChar}; it is
 * stored in {@code prefix} and, as bytes, in {@code prefixRef}. Every remaining character is
 * translated through {@code appendRegExpForChar} and the result is compiled into {@code pattern}.
 */
protected void truncatedToPrefixAndPattern() {
  final int length = truncated.length();

  // Find where the literal prefix ends, i.e. the first wildcard marker.
  int split = 0;
  while (split < length && matchingChar(truncated.charAt(split))) {
    split++;
  }
  prefix = truncated.substring(0, split);
  prefixRef = new BytesRef(prefix);

  // Everything from the first wildcard onwards becomes the regular expression.
  StringBuilder regex = new StringBuilder();
  for (int pos = split; pos < length; pos++) {
    appendRegExpForChar(truncated.charAt(pos), regex);
  }
  pattern = Pattern.compile(regex.toString());
}

/**
 * Passes terms of {@code fieldName} that match this truncation query to {@code mtv}. Nothing is
 * visited when {@code MultiTerms.getTerms} returns {@code null} for the field.
 *
 * <p>NOTE(review): this body interleaves two implementations, a prefix seek followed by a
 * {@code java.util.regex} match on the remainder and a {@code CompiledAutomaton}-driven
 * enumeration. It declares {@code termsEnum} twice, and the {@code try}/{@code finally} shares
 * closing braces with the second {@code while}. It reads like a rendered diff with removed and
 * added lines mixed together; confirm which lines are current before relying on it.
 *
 * @param reader index reader whose terms are enumerated
 * @param fieldName field whose terms are examined
 * @param mtv callback that receives each matching term
 * @throws IOException propagated from the underlying terms access
 */
@Override
public void visitMatchingTerms(IndexReader reader, String fieldName, MatchingTermVisitor mtv)
throws IOException {
int prefixLength = prefix.length();
Terms terms = MultiTerms.getTerms(reader, fieldName);
if (terms != null) {
// One Matcher is created here and reset for each candidate term below.
Matcher matcher = pattern.matcher("");
try {
TermsEnum termsEnum = terms.iterator();

// Regex variant: position on the first term >= the literal prefix.
TermsEnum.SeekStatus status = termsEnum.seekCeil(prefixRef);
BytesRef text;
if (status == TermsEnum.SeekStatus.FOUND) {
text = prefixRef;
} else if (status == TermsEnum.SeekStatus.NOT_FOUND) {
text = termsEnum.term();
} else {
text = null;
}
// Automaton variant: the enum is obtained from the precompiled automaton.
TermsEnum termsEnum = compiled.getTermsEnum(terms);

while (text != null) {
if (StringHelper.startsWith(text, prefixRef)) {
String textString = text.utf8ToString();
// Only the part after the literal prefix is matched against the pattern.
matcher.reset(textString.substring(prefixLength));
if (matcher.matches()) {
mtv.visitMatchingTerm(new Term(fieldName, textString));
}
} else {
break;
}
text = termsEnum.next();
}
} finally {
matcher.reset();
BytesRef br;
while ((br = termsEnum.next()) != null) {
// Deep copy before handing off; presumably the enum reuses its BytesRef between next()
// calls (TODO confirm).
mtv.visitMatchingTerm(new Term(fieldName, BytesRef.deepCopyOf(br)));
}
}
}
Expand Down

0 comments on commit 86ccef0

Please sign in to comment.