Skip to content
Permalink
Browse files
Merge pull request #573 from apache/OAK-9772
OAK-9772 Elastic and Lucene tokenizer difference
  • Loading branch information
thomasmueller committed May 19, 2022
2 parents fa48a1d + 0bc5457 commit 26cf3f2a0df10e90645ca22c387dbe1a74557880
Showing 8 changed files with 207 additions and 33 deletions.
@@ -40,7 +40,7 @@ however there are differences:
* `functionName` is ignored.
* `name` is ignored.
* `indexPath` is ignored.
* `analyzers` is ignored.
* `analyzers` is ignored, except for `indexOriginalTerm`.
* For property definitions, `sync` and `unique` are ignored.
* The behavior for `dynamicBoost` is slightly different:
For Lucene indexes, boosting is done at indexing time, while for Elastic it is done at query time.
@@ -32,7 +32,7 @@ public class OakAnalyzer extends Analyzer {

private final Version matchVersion;

private final int INDEX_ORIGINAL_TERM;
private final int preserveOriginal;

/**
* Creates a new {@link OakAnalyzer}
@@ -55,7 +55,7 @@ public OakAnalyzer(Version matchVersion) {
*/
public OakAnalyzer(Version matchVersion, boolean indexOriginalTerm) {
this.matchVersion = matchVersion;
// NOTE(review): this listing looks like a diff that shows the removed and the added
// form of the same assignment one after the other - confirm which line the file keeps.
// Both forms store WordDelimiterFilter.PRESERVE_ORIGINAL when indexOriginalTerm is true
// and 0 otherwise; the stored value is later OR-ed into the WordDelimiterFilter flags.
INDEX_ORIGINAL_TERM = indexOriginalTerm?WordDelimiterFilter.PRESERVE_ORIGINAL:0;
preserveOriginal = indexOriginalTerm ? WordDelimiterFilter.PRESERVE_ORIGINAL : 0;
}

@Override
@@ -66,7 +66,7 @@ protected TokenStreamComponents createComponents(final String fieldName,
tok = new WordDelimiterFilter(tok,
WordDelimiterFilter.GENERATE_WORD_PARTS
| WordDelimiterFilter.STEM_ENGLISH_POSSESSIVE
| this.INDEX_ORIGINAL_TERM
| preserveOriginal
| WordDelimiterFilter.GENERATE_NUMBER_PARTS, null);
return new TokenStreamComponents(src, tok);
}
@@ -0,0 +1,52 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.jackrabbit.oak.plugins.index.lucene;

import org.apache.jackrabbit.oak.api.ContentRepository;
import org.apache.jackrabbit.oak.plugins.index.FullTextIndexCommonTest;
import org.apache.jackrabbit.oak.plugins.index.LuceneIndexOptions;
import org.junit.After;
import org.junit.Rule;
import org.junit.rules.TemporaryFolder;

import java.io.File;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;

/**
 * Runs the test cases inherited from {@link FullTextIndexCommonTest} against a repository
 * built by {@code LuceneTestRepositoryBuilder}, using {@link LuceneIndexOptions}.
 */
public class LuceneFullTextIndexCommonTest extends FullTextIndexCommonTest {

    /** Executor handed to the repository builder; shut down after each test. */
    private final ExecutorService executorService = Executors.newFixedThreadPool(2);

    /** Temporary folder rooted at {@code target}, handed to the repository builder. */
    @Rule
    public TemporaryFolder temporaryFolder = new TemporaryFolder(new File("target"));

    /**
     * Builds the test repository, stores it in {@code repositoryOptionsUtil}, sets
     * {@code indexOptions} and returns the content repository created from it.
     */
    @Override
    protected ContentRepository createRepository() {
        LuceneTestRepositoryBuilder repositoryBuilder =
                new LuceneTestRepositoryBuilder(executorService, temporaryFolder);
        repositoryOptionsUtil = repositoryBuilder.build();
        indexOptions = new LuceneIndexOptions();
        return repositoryOptionsUtil.getOak().createContentRepository();
    }

    /** Creates no index node; only turns traversal off. */
    @Override
    protected void createTestIndexNode() {
        setTraversalEnabled(false);
    }

    /** Shuts the executor down once a test has finished. */
    @After
    public void shutdownExecutor() {
        executorService.shutdown();
    }
}
@@ -83,6 +83,9 @@ public class ElasticIndexDefinition extends IndexDefinition {
*/
private static final String INDEX_ORIGINAL_TERM = "indexOriginalTerm";

private static final String SPLIT_ON_CASE_CHANGE = "splitOnCaseChange";
private static final String SPLIT_ON_NUMERICS = "splitOnNumerics";

private static final String SIMILARITY_TAGS_ENABLED = "similarityTagsEnabled";
private static final boolean SIMILARITY_TAGS_ENABLED_DEFAULT = true;

@@ -228,11 +231,21 @@ protected String getDefaultFunctionName() {
/**
* Returns {@code true} if original terms need to be preserved at indexing analysis phase.
* <p>
* The value is read from the {@code indexOriginalTerm} property of the {@code ANALYZERS}
* child node of the index definition; {@code false} is passed to {@code getOptionalValue}
* as its last argument (presumably the value used when the property is absent - confirm).
*/
public boolean indexOriginalTerms() {
public boolean analyzerConfigIndexOriginalTerms() {
NodeState analyzersTree = definition.getChildNode(ANALYZERS);
return getOptionalValue(analyzersTree, INDEX_ORIGINAL_TERM, false);
}

/**
 * Reads the {@code splitOnCaseChange} flag from the {@code ANALYZERS} child node of the
 * index definition.
 *
 * @return the value obtained from {@code getOptionalValue}, which is given {@code false}
 *         as its last argument
 */
public boolean analyzerConfigSplitOnCaseChange() {
    return getOptionalValue(definition.getChildNode(ANALYZERS), SPLIT_ON_CASE_CHANGE, false);
}

/**
 * Reads the {@code splitOnNumerics} flag from the {@code ANALYZERS} child node of the
 * index definition.
 *
 * @return the value obtained from {@code getOptionalValue}, which is given {@code false}
 *         as its last argument
 */
public boolean analyzerConfigSplitOnNumerics() {
    return getOptionalValue(definition.getChildNode(ANALYZERS), SPLIT_ON_NUMERICS, false);
}

@Override
protected PropertyDefinition createPropertyDefinition(IndexDefinition.IndexingRule rule, String name, NodeState nodeState) {
return new ElasticPropertyDefinition(rule, name, nodeState);
@@ -115,7 +115,9 @@ private static XContentBuilder loadSettings(ElasticIndexDefinition indexDefiniti
settingsBuilder.field("generate_word_parts", true);
settingsBuilder.field("stem_english_possessive", true);
settingsBuilder.field("generate_number_parts", true);
settingsBuilder.field("preserve_original", indexDefinition.indexOriginalTerms());
settingsBuilder.field("split_on_numerics", indexDefinition.analyzerConfigSplitOnNumerics());
settingsBuilder.field("split_on_case_change", indexDefinition.analyzerConfigSplitOnCaseChange());
settingsBuilder.field("preserve_original", indexDefinition.analyzerConfigIndexOriginalTerms());
}
settingsBuilder.endObject();

@@ -220,33 +220,6 @@ public void fullTextMultiTermQuery() throws Exception {
);
}

/**
 * Indexes the value {@code sun.jpg} in an analyzed, node-scope-indexed property and checks
 * that property-level and node-level full-text queries for the whole value and for its
 * parts, in various letter cases, all return {@code /test/a}.
 */
@Test
public void defaultAnalyzer() throws Exception {
    IndexDefinitionBuilder indexBuilder = createIndex("analyzed_field");
    indexBuilder.async("async");
    indexBuilder.indexRule("nt:base").property("analyzed_field").analyzed().nodeScopeIndex();

    String indexName = UUID.randomUUID().toString();
    setIndex(indexName, indexBuilder);
    root.commit();

    // add content
    Tree parent = root.getTree("/").addChild("test");
    Tree node = parent.addChild("a");
    node.setProperty("analyzed_field", "sun.jpg");
    root.commit();

    // Queries are evaluated in this order; each one must return exactly /test/a.
    final String[] matchingQueries = {
        "//*[jcr:contains(@analyzed_field, 'SUN.JPG')] ",
        "//*[jcr:contains(@analyzed_field, 'Sun')] ",
        "//*[jcr:contains(@analyzed_field, 'jpg')] ",
        "//*[jcr:contains(., 'SUN.jpg')] ",
        "//*[jcr:contains(., 'sun')] ",
        "//*[jcr:contains(., 'jpg')] "
    };
    assertEventually(() -> {
        for (String query : matchingQueries) {
            assertQuery(query, XPATH, Collections.singletonList("/test/a"));
        }
    });
}

@Test
public void fulltextWithModifiedNodeScopeIndex() throws Exception {
IndexDefinitionBuilder builder = createIndex("analyzed_field");
@@ -0,0 +1,43 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.jackrabbit.oak.plugins.index.elastic;

import org.apache.jackrabbit.oak.api.ContentRepository;
import org.apache.jackrabbit.oak.plugins.index.FullTextIndexCommonTest;
import org.junit.ClassRule;

/**
 * Runs the test cases inherited from {@link FullTextIndexCommonTest} against a repository
 * built by {@code ElasticTestRepositoryBuilder}, using {@code ElasticIndexOptions}.
 */
public class ElasticFullTextIndexCommonTest extends FullTextIndexCommonTest {

    /** Class-level connection rule created from {@code ElasticTestUtils.ELASTIC_CONNECTION_STRING}. */
    @ClassRule
    public static final ElasticConnectionRule elasticRule =
            new ElasticConnectionRule(ElasticTestUtils.ELASTIC_CONNECTION_STRING);

    /** Sets {@code indexOptions} at construction time. */
    public ElasticFullTextIndexCommonTest() {
        indexOptions = new ElasticIndexOptions();
    }

    /**
     * Builds the test repository from the connection rule, stores it in
     * {@code repositoryOptionsUtil} and returns the content repository created from it.
     */
    @Override
    protected ContentRepository createRepository() {
        ElasticTestRepositoryBuilder repositoryBuilder = new ElasticTestRepositoryBuilder(elasticRule);
        repositoryOptionsUtil = repositoryBuilder.build();
        return repositoryOptionsUtil.getOak().createContentRepository();
    }

    /** Creates no index node; only turns traversal off. */
    @Override
    protected void createTestIndexNode() {
        setTraversalEnabled(false);
    }
}
@@ -0,0 +1,91 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.jackrabbit.oak.plugins.index;

import org.apache.jackrabbit.oak.api.Tree;
import org.apache.jackrabbit.oak.plugins.index.search.util.IndexDefinitionBuilder;
import org.apache.jackrabbit.oak.query.AbstractQueryTest;
import org.junit.Test;

import java.util.Collections;
import java.util.UUID;

/**
 * Full-text analyzer test cases written against {@link IndexOptions} and {@link TestRepository}.
 * Neither {@code indexOptions} nor {@code repositoryOptionsUtil} is assigned in this class;
 * concrete subclasses are expected to set them before the tests run.
 */
public abstract class FullTextIndexCommonTest extends AbstractQueryTest {

    protected IndexOptions indexOptions;
    protected TestRepository repositoryOptionsUtil;

    /**
     * Delegates to {@code TestUtils.assertEventually} with a timeout derived from the
     * repository settings.
     *
     * @param r the assertions to run
     */
    protected void assertEventually(Runnable r) {
        // NOTE(review): a value named "...InSeconds" is added to 3000 before scaling by 5 -
        // confirm the unit TestUtils.assertEventually expects.
        TestUtils.assertEventually(
                r,
                ((repositoryOptionsUtil.isAsync() ? repositoryOptionsUtil.defaultAsyncIndexingTimeInSeconds : 0) + 3000) * 5);
    }

    /**
     * Indexes {@code sun.jpg} and checks that property-level and node-level full-text
     * queries for the whole value and for its parts, in various letter cases, all
     * return {@code /test/a}.
     */
    @Test
    public void defaultAnalyzer() throws Exception {
        indexSingleAnalyzedValue("sun.jpg");

        // Queries are evaluated in this order; each one must return exactly /test/a.
        final String[] matchingQueries = {
            "//*[jcr:contains(@analyzed_field, 'SUN.JPG')] ",
            "//*[jcr:contains(@analyzed_field, 'Sun')] ",
            "//*[jcr:contains(@analyzed_field, 'jpg')] ",
            "//*[jcr:contains(., 'SUN.jpg')] ",
            "//*[jcr:contains(., 'sun')] ",
            "//*[jcr:contains(., 'jpg')] "
        };
        assertEventually(() -> {
            for (String query : matchingQueries) {
                assertQuery(query, XPATH, Collections.singletonList("/test/a"));
            }
        });
    }

    /**
     * Indexes {@code 1234abCd5678} and checks that queries for the fragments {@code 1234},
     * {@code abcd} and {@code 5678} return nothing, while a query for the complete value
     * returns {@code /test/a}.
     */
    @Test
    public void defaultAnalyzerHonourSplitOptions() throws Exception {
        indexSingleAnalyzedValue("1234abCd5678");

        // Fragment queries, evaluated in this order; none of them may return a result.
        final String[] nonMatchingQueries = {
            "//*[jcr:contains(@analyzed_field, '1234')] ",
            "//*[jcr:contains(@analyzed_field, 'abcd')] ",
            "//*[jcr:contains(@analyzed_field, '5678')] "
        };
        assertEventually(() -> {
            for (String query : nonMatchingQueries) {
                assertQuery(query, XPATH, Collections.emptyList());
            }
            assertQuery("//*[jcr:contains(@analyzed_field, '1234abCd5678')] ", XPATH, Collections.singletonList("/test/a"));
        });
    }

    /**
     * Shared setup of the analyzer tests: registers a non-async index under a random name
     * whose {@code nt:base} rule marks {@code analyzed_field} as analyzed and node-scope
     * indexed, commits, then creates {@code /test/a} with {@code analyzed_field} set to
     * the given value and commits again.
     *
     * @param value the text stored in {@code analyzed_field} of {@code /test/a}
     */
    private void indexSingleAnalyzedValue(String value) throws Exception {
        IndexDefinitionBuilder definition = indexOptions.createIndex(
                indexOptions.createIndexDefinitionBuilder(), false, "analyzed_field");
        definition.noAsync();
        definition.indexRule("nt:base").property("analyzed_field").analyzed().nodeScopeIndex();

        indexOptions.setIndex(root, UUID.randomUUID().toString(), definition);
        root.commit();

        // add content
        Tree node = root.getTree("/").addChild("test").addChild("a");
        node.setProperty("analyzed_field", value);
        root.commit();
    }

}

0 comments on commit 26cf3f2

Please sign in to comment.