Skip to content
Permalink
Browse files
OAK-9691 Improve fulltext query syntax support for ElasticSearch (#489)
  • Loading branch information
FrancoisZhang committed Feb 11, 2022
1 parent 819db78 commit 3fffce60331a922084837e9924c139d7eb5496af
Showing 4 changed files with 77 additions and 81 deletions.
@@ -52,7 +52,6 @@
import org.apache.jackrabbit.oak.spi.mount.Mounts;
import org.apache.jackrabbit.oak.spi.query.QueryIndexProvider;
import org.apache.jackrabbit.oak.spi.security.OpenSecurityProvider;
import org.junit.Ignore;
import org.junit.Test;

import ch.qos.logback.classic.Level;
@@ -134,27 +133,6 @@ public void testIndexingDynamicBoostMissingProperty() throws Exception {
assertEquals("[]", log);
}

// verifying fulltext query syntax ability, put here for comparison with dynamic boost and dynamic boost lite
@Test
public void testQueryFullTextWithTitle() throws Exception {
createAssetsIndexAndProperties(false, false);
prepareTestAssets();

assertQuery("//element(*, dam:Asset)[jcr:contains(., 'titleone')]", XPATH, Arrays.asList("/test/asset1"));
assertQuery("//element(*, dam:Asset)[jcr:contains(., 'long')]", XPATH, Arrays.asList("/test/asset1", "/test/asset2"));
// case insensitive
assertQuery("//element(*, dam:Asset)[jcr:contains(., 'LONG')]", XPATH, Arrays.asList("/test/asset1", "/test/asset2"));
// wildcard works
assertQuery("//element(*, dam:Asset)[jcr:contains(., 'title*')]", XPATH,
Arrays.asList("/test/asset1", "/test/asset2", "/test/asset3"));
// space is treated as AND
assertQuery("//element(*, dam:Asset)[jcr:contains(., 'long titleone')]", XPATH, Arrays.asList("/test/asset1"));
// OR works
assertQuery("//element(*, dam:Asset)[jcr:contains(., 'long OR titleone')]", XPATH, Arrays.asList("/test/asset1", "/test/asset2"));
// minus works
assertQuery("//element(*, dam:Asset)[jcr:contains(., 'long -titleone')]", XPATH, Arrays.asList("/test/asset2"));
}

@Test
public void testQueryDynamicBoostBasic() throws Exception {
createAssetsIndexAndProperties(false, false);
@@ -224,7 +202,12 @@ public void testQueryDynamicBoostLiteWildcard() throws Exception {
createAssetsIndexAndProperties(true, true);
prepareTestAssets();

assertQuery("select [jcr:path] from [dam:Asset] where contains(*, 'blu*')", SQL2, Arrays.asList("/test/asset3"));
assertQuery("select [jcr:path] from [dam:Asset] where contains(*, 'blu?')", SQL2, Arrays.asList("/test/asset3"));
assertQuery("select [jcr:path] from [dam:Asset] where contains(*, 'bl?e')", SQL2, Arrays.asList("/test/asset3"));
assertQuery("select [jcr:path] from [dam:Asset] where contains(*, '?lue')", SQL2, Arrays.asList("/test/asset3"));
assertQuery("select [jcr:path] from [dam:Asset] where contains(*, 'coff*')", SQL2, Arrays.asList("/test/asset2"));
assertQuery("select [jcr:path] from [dam:Asset] where contains(*, 'co*ee')", SQL2, Arrays.asList("/test/asset2"));
assertQuery("select [jcr:path] from [dam:Asset] where contains(*, '*ffee')", SQL2, Arrays.asList("/test/asset2"));
}

@Test
@@ -16,12 +16,29 @@
*/
package org.apache.jackrabbit.oak.plugins.index.elastic.query;

import java.io.IOException;
import java.nio.charset.Charset;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.concurrent.atomic.AtomicReference;
import java.util.function.BiConsumer;
import java.util.function.BiPredicate;
import java.util.function.Consumer;
import java.util.stream.Stream;
import java.util.stream.StreamSupport;

import javax.jcr.PropertyType;

import org.apache.http.entity.ContentType;
import org.apache.http.nio.entity.NByteArrayEntity;
import org.apache.jackrabbit.oak.api.Blob;
import org.apache.jackrabbit.oak.api.PropertyState;
import org.apache.jackrabbit.oak.api.Type;
import org.apache.jackrabbit.oak.commons.PathUtils;
import org.apache.jackrabbit.oak.plugins.index.IndexConstants;
import org.apache.jackrabbit.oak.plugins.index.elastic.ElasticIndexDefinition;
import org.apache.jackrabbit.oak.plugins.index.elastic.ElasticPropertyDefinition;
import org.apache.jackrabbit.oak.plugins.index.elastic.query.async.facets.ElasticFacetProvider;
@@ -34,7 +51,6 @@
import org.apache.jackrabbit.oak.plugins.index.search.spi.query.FulltextIndex;
import org.apache.jackrabbit.oak.plugins.index.search.spi.query.FulltextIndexPlanner;
import org.apache.jackrabbit.oak.plugins.index.search.spi.query.FulltextIndexPlanner.PlanResult;
import org.apache.jackrabbit.oak.plugins.index.IndexConstants;
import org.apache.jackrabbit.oak.spi.query.Filter;
import org.apache.jackrabbit.oak.spi.query.QueryConstants;
import org.apache.jackrabbit.oak.spi.query.QueryIndex;
@@ -48,13 +64,10 @@
import org.apache.jackrabbit.oak.spi.state.NodeState;
import org.apache.lucene.search.WildcardQuery;
import org.apache.lucene.search.join.ScoreMode;
import org.elasticsearch.common.Strings;
import org.elasticsearch.xcontent.XContentBuilder;
import org.elasticsearch.xcontent.json.JsonXContent;
import org.apache.lucene.util.BytesRef;
import org.elasticsearch.client.Request;
import org.elasticsearch.common.Strings;
import org.elasticsearch.common.xcontent.XContentHelper;
import org.elasticsearch.xcontent.XContentType;
import org.elasticsearch.index.query.BoolQueryBuilder;
import org.elasticsearch.index.query.InnerHitBuilder;
import org.elasticsearch.index.query.MatchBoolPrefixQueryBuilder;
@@ -66,6 +79,7 @@
import org.elasticsearch.index.query.Operator;
import org.elasticsearch.index.query.QueryBuilder;
import org.elasticsearch.index.query.QueryBuilders;
import org.elasticsearch.index.query.QueryStringQueryBuilder;
import org.elasticsearch.index.query.WrapperQueryBuilder;
import org.elasticsearch.index.query.functionscore.ScoreFunctionBuilders;
import org.elasticsearch.search.aggregations.AggregationBuilders;
@@ -77,26 +91,14 @@
import org.elasticsearch.search.suggest.SuggestBuilders;
import org.elasticsearch.search.suggest.phrase.DirectCandidateGeneratorBuilder;
import org.elasticsearch.search.suggest.phrase.PhraseSuggestionBuilder;
import org.elasticsearch.xcontent.XContentBuilder;
import org.elasticsearch.xcontent.XContentType;
import org.elasticsearch.xcontent.json.JsonXContent;
import org.jetbrains.annotations.NotNull;
import org.jetbrains.annotations.Nullable;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import javax.jcr.PropertyType;
import java.io.IOException;
import java.nio.charset.Charset;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.concurrent.atomic.AtomicReference;
import java.util.function.BiConsumer;
import java.util.function.BiPredicate;
import java.util.function.Consumer;
import java.util.stream.Stream;
import java.util.stream.StreamSupport;

import static org.apache.jackrabbit.JcrConstants.JCR_MIXINTYPES;
import static org.apache.jackrabbit.JcrConstants.JCR_PRIMARYTYPE;
import static org.apache.jackrabbit.oak.plugins.index.elastic.util.ElasticIndexUtils.toDoubles;
@@ -113,19 +115,17 @@
import static org.apache.jackrabbit.oak.spi.query.QueryConstants.JCR_PATH;
import static org.apache.jackrabbit.oak.spi.query.QueryConstants.JCR_SCORE;
import static org.apache.jackrabbit.util.ISO8601.parse;
import static org.elasticsearch.xcontent.ToXContent.EMPTY_PARAMS;
import static org.elasticsearch.index.query.MoreLikeThisQueryBuilder.Item;
import static org.elasticsearch.index.query.QueryBuilders.boolQuery;
import static org.elasticsearch.index.query.QueryBuilders.functionScoreQuery;
import static org.elasticsearch.index.query.QueryBuilders.matchAllQuery;
import static org.elasticsearch.index.query.QueryBuilders.matchQuery;
import static org.elasticsearch.index.query.QueryBuilders.moreLikeThisQuery;
import static org.elasticsearch.index.query.QueryBuilders.multiMatchQuery;
import static org.elasticsearch.index.query.QueryBuilders.nestedQuery;
import static org.elasticsearch.index.query.QueryBuilders.queryStringQuery;
import static org.elasticsearch.index.query.QueryBuilders.simpleQueryStringQuery;
import static org.elasticsearch.index.query.QueryBuilders.termQuery;
import static org.elasticsearch.index.query.QueryBuilders.wrapperQuery;
import static org.elasticsearch.xcontent.ToXContent.EMPTY_PARAMS;

/**
* Class to map query plans into Elastic request objects.
@@ -779,41 +779,13 @@ private static QueryBuilder referenceConstraint(String uuid) {
}

private static QueryBuilder fullTextQuery(String text, String fieldName, PlanResult pr) {
// default match query are executed in OR, we need to use AND instead to avoid that
// every document having at least one term in the `text` will match. If there are multiple
// contains clause they will go to different match queries and will be executed in OR
if (FieldNames.FULLTEXT.equals(fieldName) && !pr.indexingRule.getNodeScopeAnalyzedProps().isEmpty()) {
MultiMatchQueryBuilder multiMatchQuery = multiMatchQuery(text)
.operator(Operator.AND)
.type(MultiMatchQueryBuilder.Type.CROSS_FIELDS);
pr.indexingRule.getNodeScopeAnalyzedProps().forEach(pd -> multiMatchQuery.field(pd.name, pd.boost));
// Add the query for actual fulltext field also. That query would not be boosted
// and could contain other parts like renditions, node name, etc
return multiMatchQuery.field(fieldName);
} else {
// https://www.elastic.co/guide/en/elasticsearch/reference/current/query-dsl-query-string-query.html
// simpleQueryStringQuery does not support leading wildcards whereas it's supported by default in queryStringQuery
// Not using queryStringQuery by default , since some functional cases break.
// simpleQueryStringQuery is less Strict, for instance searches for terms starting with / work, whereas
// with queryStringQuery, they throw an Exception (which ultimately results in an empty result set in oak),
// so using simpleQueryStringQuery by default would break certain functional cases.
// So only support this in case any term in the text String actually starts with *
// For example *hello or Hello *world
String[] textTerms = text.split(" ");
boolean allowLeadingWildCard = false;
for(String textTerm : textTerms) {
if (textTerm.startsWith("*")) {
allowLeadingWildCard = true;
break;
}
}

if (allowLeadingWildCard) {
return queryStringQuery(text).field(fieldName).defaultOperator(Operator.AND);
} else {
return simpleQueryStringQuery(text).analyzeWildcard(true).field(fieldName).defaultOperator(Operator.AND);
}
LOG.debug("fullTextQuery for text: '{}', fieldName: '{}'", text, fieldName);
QueryStringQueryBuilder queryStringQueryBuilder = queryStringQuery(FulltextIndex.rewriteQueryText(text)).defaultOperator(
Operator.AND).type(MultiMatchQueryBuilder.Type.CROSS_FIELDS);
if (FieldNames.FULLTEXT.equals(fieldName)) {
pr.indexingRule.getNodeScopeAnalyzedProps().forEach(pd -> queryStringQueryBuilder.field(pd.name, pd.boost));
}
return queryStringQueryBuilder.field(fieldName);
}

private QueryBuilder createQuery(String propertyName, Filter.PropertyRestriction pr,
@@ -288,7 +288,7 @@ protected static PlanResult getPlanResult(IndexPlan plan) {
/**
* Following logic is taken from org.apache.jackrabbit.core.query.lucene.JackrabbitQueryParser#parse(java.lang.String)
*/
protected static String rewriteQueryText(String textsearch) {
public static String rewriteQueryText(String textsearch) {
// replace escaped ' with just '
StringBuilder rewritten = new StringBuilder();
// most query parsers recognize 'AND' and 'NOT' as
@@ -16,6 +16,7 @@
*/
package org.apache.jackrabbit.oak.plugins.index;

import org.apache.jackrabbit.oak.api.CommitFailedException;
import org.apache.jackrabbit.oak.api.PropertyState;
import org.apache.jackrabbit.oak.api.Tree;
import org.apache.jackrabbit.oak.api.Type;
@@ -526,6 +527,46 @@ public void fullTextQueryTestAllowLeadingWildcards2() throws Exception {
});
}

@Test
public void fullTextQueryGeneric() throws Exception {
Tree test = root.getTree("/").addChild("test");

Tree testNodeA = test.addChild("nodea");
testNodeA.setProperty("a", "hello");
testNodeA.setProperty("b", "ocean");

Tree testNodeB = test.addChild("nodeb");
testNodeB.setProperty("a", "hello world");
testNodeB.setProperty("b", "soccer-shoe");

Tree testNodeC = test.addChild("nodec");
testNodeC.setProperty("a", "hello");
testNodeC.setProperty("b", "world");
root.commit();

assertEventually(() -> {
// case insensitive
assertQuery("//*[jcr:contains(., 'WORLD')] ", XPATH, Arrays.asList("/test/nodeb", "/test/nodec"));

// wild card
assertQuery("//*[jcr:contains(., 'Hell*')] ", XPATH, Arrays.asList("/test/nodea", "/test/nodeb", "/test/nodec"));
assertQuery("//*[jcr:contains(., 'He*o')] ", XPATH, Arrays.asList("/test/nodea", "/test/nodeb", "/test/nodec"));
assertQuery("//*[jcr:contains(., '*llo')] ", XPATH, Arrays.asList("/test/nodea", "/test/nodeb", "/test/nodec"));
assertQuery("//*[jcr:contains(., '?orld')] ", XPATH, Arrays.asList("/test/nodeb", "/test/nodec"));
assertQuery("//*[jcr:contains(., 'wo?ld')] ", XPATH, Arrays.asList("/test/nodeb", "/test/nodec"));
assertQuery("//*[jcr:contains(., 'worl?')] ", XPATH, Arrays.asList("/test/nodeb", "/test/nodec"));

// space explained as AND
assertQuery("//*[jcr:contains(., 'hello world')] ", XPATH, Arrays.asList("/test/nodeb", "/test/nodec"));

// exclude
assertQuery("//*[jcr:contains(., 'hello -world')] ", XPATH, Arrays.asList("/test/nodea"));

// explicit OR
assertQuery("//*[jcr:contains(., 'ocean OR world')] ", XPATH, Arrays.asList("/test/nodea", "/test/nodeb", "/test/nodec"));
});
}

@Test
public void testInequalityQuery_native() throws Exception {

0 comments on commit 3fffce6

Please sign in to comment.