Skip to content
Permalink
Browse files
OAK-9788: Add an index property to disable similarity for lucene index (
#589)

Introduced index definition properties, indexSimilarityBinaries and indexSimilarityStrings to enable/disable similarity indexing for binaries and strings respectively.
  • Loading branch information
tihom88 committed Jun 20, 2022
1 parent d3d824b commit 353dc3c7998c1e654cecda3d5969917f7a7d2742
Showing 7 changed files with 271 additions and 6 deletions.
@@ -170,5 +170,4 @@ public interface LuceneIndexConstants extends FulltextIndexConstants {
*/
@Deprecated
String INDEX_PATH = "indexPath";

}
@@ -19,9 +19,14 @@

package org.apache.jackrabbit.oak.plugins.index.lucene.util;

import org.apache.jackrabbit.oak.api.Type;
import org.apache.jackrabbit.oak.plugins.index.lucene.LuceneIndexConstants;
import org.apache.jackrabbit.oak.plugins.index.search.FulltextIndexConstants;
import org.apache.jackrabbit.oak.plugins.index.search.util.IndexDefinitionBuilder;
import org.apache.jackrabbit.oak.spi.state.NodeBuilder;

import java.util.Arrays;

import static org.apache.jackrabbit.oak.plugins.index.lucene.LuceneIndexConstants.TYPE_LUCENE;

public final class LuceneIndexDefinitionBuilder extends IndexDefinitionBuilder {
@@ -41,5 +46,4 @@ public LuceneIndexDefinitionBuilder(NodeBuilder nodeBuilder, boolean autoManageR
protected String getIndexType() {
// Definitions produced by this builder always report the TYPE_LUCENE index type.
return TYPE_LUCENE;
}

}
@@ -2988,6 +2988,122 @@ public void testRepSimilarWithBinaryFeatureVectors() throws Exception {
}
}

/**
 * Verifies that binary feature vectors are still indexed for similarity when
 * {@code indexSimilarityBinaries} is set to "lucene" on an index built with
 * {@link LuceneIndexDefinitionBuilder}.
 *
 * @throws Exception if the fixture cannot be read or the repository operations fail
 */
@Test
public void testRepSimilarWithBinaryFeatureVectorsWithIndexSimilarityBinariesDefinedAsLucene() throws Exception {

    IndexDefinitionBuilder idxb = new LuceneIndexDefinitionBuilder().noAsync().indexSimilarityBinaries("lucene");
    idxb.indexRule("nt:base").property("fv").useInSimilarity().nodeScopeIndex().propertyIndex();

    Tree idx = root.getTree("/").getChild("oak:index").addChild("test1");
    idxb.build(idx);
    root.commit();

    Tree test = root.getTree("/").addChild("test");

    URI uri = getClass().getResource("/org/apache/jackrabbit/oak/query/fvs.csv").toURI();
    File file = new File(uri);

    // IOUtils.readLines does not close the stream it is given, so close it here.
    List<String> lines;
    try (FileInputStream in = new FileInputStream(file)) {
        lines = IOUtils.readLines(in, Charset.defaultCharset());
    }

    Collection<String> children = new LinkedList<>();
    for (String line : lines) {
        String[] split = line.split(",");
        // The first column is the node name; the remaining columns are the vector components.
        List<Double> values = new LinkedList<>();
        for (int i = 1; i < split.length; i++) {
            values.add(Double.parseDouble(split[i]));
        }

        // Sanity check: the byte encoding must round-trip before it is stored.
        byte[] bytes = SimSearchUtils.toByteArray(values);
        List<Double> actual = SimSearchUtils.toDoubles(bytes);
        assertEquals(values, actual);

        Blob blob = root.createBlob(new ByteArrayInputStream(bytes));
        String name = split[0];
        Tree child = test.addChild(name);
        child.setProperty("fv", blob, Type.BINARY);
        children.add(child.getPath());
    }
    root.commit();

    // check that similarity changes across different feature vectors
    List<String> baseline = new LinkedList<>();
    for (String similarPath : children) {
        String query = "select [jcr:path] from [nt:base] where similar(., '" + similarPath + "')";

        Iterator<String> result = executeQuery(query, "JCR-SQL2").iterator();
        List<String> current = new LinkedList<>();
        while (result.hasNext()) {
            current.add(result.next());
        }
        assertNotEquals(baseline, current);
        baseline.clear();
        baseline.addAll(current);
    }
}

/**
 * To disable similarity for binaries, the index type must not be present among the values of
 * FulltextIndexConstants.INDEX_SIMILARITY_BINARIES.
 * In this case the index type is lucene but indexSimilarityBinaries is set to elasticsearch,
 * so no similarity query is expected to return a result.
 *
 * @throws Exception if the fixture cannot be read or the repository operations fail
 */
@Test
public void testRepSimilarWithBinaryFeatureVectorsWithIndexSimilarityBinariesDefinedAsElasticsearch() throws Exception {

    IndexDefinitionBuilder idxb = new LuceneIndexDefinitionBuilder().noAsync().indexSimilarityBinaries("elasticsearch");
    idxb.indexRule("nt:base").property("fv").useInSimilarity().nodeScopeIndex().propertyIndex();

    Tree idx = root.getTree("/").getChild("oak:index").addChild("test1");
    idxb.build(idx);
    root.commit();

    Tree test = root.getTree("/").addChild("test");

    URI uri = getClass().getResource("/org/apache/jackrabbit/oak/query/fvs.csv").toURI();
    File file = new File(uri);

    // IOUtils.readLines does not close the stream it is given, so close it here.
    List<String> lines;
    try (FileInputStream in = new FileInputStream(file)) {
        lines = IOUtils.readLines(in, Charset.defaultCharset());
    }

    Collection<String> children = new LinkedList<>();
    for (String line : lines) {
        String[] split = line.split(",");
        // The first column is the node name; the remaining columns are the vector components.
        List<Double> values = new LinkedList<>();
        for (int i = 1; i < split.length; i++) {
            values.add(Double.parseDouble(split[i]));
        }

        // Sanity check: the byte encoding must round-trip before it is stored.
        byte[] bytes = SimSearchUtils.toByteArray(values);
        List<Double> actual = SimSearchUtils.toDoubles(bytes);
        assertEquals(values, actual);

        Blob blob = root.createBlob(new ByteArrayInputStream(bytes));
        String name = split[0];
        Tree child = test.addChild(name);
        child.setProperty("fv", blob, Type.BINARY);
        children.add(child.getPath());
    }
    root.commit();

    // check that no similarity query returns results, since binaries were not indexed
    for (String similarPath : children) {
        String query = "select [jcr:path] from [nt:base] where similar(., '" + similarPath + "')";

        Iterator<String> result = executeQuery(query, "JCR-SQL2").iterator();
        List<String> current = new LinkedList<>();
        while (result.hasNext()) {
            current.add(result.next());
        }
        assertEquals("binary data for similarity should not be indexed", 0, current.size());
    }
}

@Test
public void testRepSimilarWithStringFeatureVectors() throws Exception {

@@ -3032,6 +3148,97 @@ public void testRepSimilarWithStringFeatureVectors() throws Exception {
}
}

/**
 * Verifies that string-typed feature vectors are still indexed for similarity when
 * {@code indexSimilarityStrings} is set to "lucene" on an index built with
 * {@link LuceneIndexDefinitionBuilder}.
 *
 * @throws Exception if the fixture cannot be read or the repository operations fail
 */
@Test
public void testRepSimilarWithStringFeatureVectorsWithIndexSimilarityStringsDefinedAsLucene() throws Exception {

    IndexDefinitionBuilder idxb = new LuceneIndexDefinitionBuilder().noAsync().indexSimilarityStrings("lucene");
    idxb.indexRule("nt:base").property("fv").useInSimilarity().nodeScopeIndex().propertyIndex();

    Tree idx = root.getTree("/").getChild("oak:index").addChild("test1");
    idxb.build(idx);
    root.commit();

    Tree test = root.getTree("/").addChild("test");

    URI uri = getClass().getResource("/org/apache/jackrabbit/oak/query/fvs.csv").toURI();
    File file = new File(uri);

    // IOUtils.readLines does not close the stream it is given, so close it here.
    List<String> lines;
    try (FileInputStream in = new FileInputStream(file)) {
        lines = IOUtils.readLines(in, Charset.defaultCharset());
    }

    Collection<String> children = new LinkedList<>();
    for (String line : lines) {
        // Everything before the first comma is the node name; the rest is stored verbatim.
        int comma = line.indexOf(',');
        String name = line.substring(0, comma);
        String value = line.substring(comma + 1);
        Tree child = test.addChild(name);
        child.setProperty("fv", value, Type.STRING);
        children.add(child.getPath());
    }
    root.commit();

    // check that similarity changes across different feature vectors
    List<String> baseline = new LinkedList<>();
    for (String similarPath : children) {
        String query = "select [jcr:path] from [nt:base] where similar(., '" + similarPath + "')";

        Iterator<String> result = executeQuery(query, "JCR-SQL2").iterator();
        List<String> current = new LinkedList<>();
        while (result.hasNext()) {
            current.add(result.next());
        }
        assertNotEquals(baseline, current);
        baseline.clear();
        baseline.addAll(current);
    }
}

/**
 * To disable similarity for strings, the index type must not be present among the values of
 * FulltextIndexConstants.INDEX_SIMILARITY_STRINGS.
 * In this case the index type is lucene but indexSimilarityStrings is set to elasticsearch,
 * so no similarity query is expected to return a result.
 *
 * @throws Exception if the fixture cannot be read or the repository operations fail
 */
@Test
public void testRepSimilarWithStringFeatureVectorsWithIndexSimilarityStringsDefinedAsElasticsearch() throws Exception {

    IndexDefinitionBuilder idxb = new LuceneIndexDefinitionBuilder().noAsync().indexSimilarityStrings("elasticsearch");
    idxb.indexRule("nt:base").property("fv").useInSimilarity().nodeScopeIndex().propertyIndex();

    Tree idx = root.getTree("/").getChild("oak:index").addChild("test1");
    idxb.build(idx);
    root.commit();

    Tree test = root.getTree("/").addChild("test");

    URI uri = getClass().getResource("/org/apache/jackrabbit/oak/query/fvs.csv").toURI();
    File file = new File(uri);

    // IOUtils.readLines does not close the stream it is given, so close it here.
    List<String> lines;
    try (FileInputStream in = new FileInputStream(file)) {
        lines = IOUtils.readLines(in, Charset.defaultCharset());
    }

    Collection<String> children = new LinkedList<>();
    for (String line : lines) {
        // Everything before the first comma is the node name; the rest is stored verbatim.
        int comma = line.indexOf(',');
        String name = line.substring(0, comma);
        String value = line.substring(comma + 1);
        Tree child = test.addChild(name);
        child.setProperty("fv", value, Type.STRING);
        children.add(child.getPath());
    }
    root.commit();

    // check that no similarity query returns results, since strings were not indexed
    for (String similarPath : children) {
        String query = "select [jcr:path] from [nt:base] where similar(., '" + similarPath + "')";

        Iterator<String> result = executeQuery(query, "JCR-SQL2").iterator();
        List<String> current = new LinkedList<>();
        while (result.hasNext()) {
            current.add(result.next());
        }
        assertEquals("String data for similarity should not be indexed", 0, current.size());
    }
}

@Test
public void testRepSimilarWithBinaryFeatureVectorsAndRerank() throws Exception {

@@ -401,4 +401,16 @@ public static IndexingMode from(String indexingMode) {
* cost). The value is: nodes, the path. For properties, the path of the node, then '@' property.
*/
String USE_IF_EXISTS = "useIfExists";

/**
 * Multi-valued string property listing the index types for which binaries are indexed
 * for similarity searches.
 * If the property is not defined, similarity indexing of binaries is enabled. If it is
 * defined, it is enabled only when the index definition's own type is one of the listed values.
 */
String INDEX_SIMILARITY_BINARIES = "indexSimilarityBinaries";

/**
 * Multi-valued string property listing the index types for which strings are indexed
 * for similarity searches.
 * If the property is not defined, similarity indexing of strings is enabled. If it is
 * defined, it is enabled only when the index definition's own type is one of the listed values.
 */
String INDEX_SIMILARITY_STRINGS = "indexSimilarityStrings";
}
@@ -24,6 +24,7 @@
import java.util.Arrays;
import java.util.Collections;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Locale;
import java.util.Map;
@@ -194,6 +195,9 @@ public class IndexDefinition implements Aggregate.AggregateMapper {
public static final OrderEntry NATIVE_SORT_ORDER = new OrderEntry(JCR_SCORE, Type.UNDEFINED,
OrderEntry.Order.DESCENDING);

// Whether binary properties used in similarity should be indexed for it;
// derived in the constructor from the INDEX_SIMILARITY_BINARIES property.
private boolean indexSimilarityBinaries;
// Whether string properties used in similarity should be indexed for it;
// derived in the constructor from the INDEX_SIMILARITY_STRINGS property.
private boolean indexSimilarityStrings;

/**
* Dynamic boost uses index time boosting. This requires to have a separate field for each unique term that needs to
* be boosted. With a high number of terms (thousands), this could result in a sparse index that requires extra disk
@@ -488,13 +492,29 @@ protected IndexDefinition(NodeState root, NodeState defn, IndexFormatVersion ver
defn.getProperty(TYPE_PROPERTY_NAME) != null &&
DYNAMIC_BOOST_LITE.contains(defn.getProperty(TYPE_PROPERTY_NAME).getValue(Type.STRING))
);
this.indexSimilarityBinaries = getSimilarityDefaultValue(defn, INDEX_SIMILARITY_BINARIES);
this.indexSimilarityStrings = getSimilarityDefaultValue(defn, INDEX_SIMILARITY_STRINGS);
} catch (IllegalStateException e) {
log.error("Config error for index definition at {} . Please correct the index definition "
+ "and reindex after correction. Additional Info : {}", indexPath, e.getMessage(), e);
throw new IllegalStateException(e);
}
}

/**
 * Decides whether similarity indexing controlled by {@code propertyKey} is enabled.
 *
 * @param defn        the index definition node state to read from
 * @param propertyKey name of the property holding the list of enabled index types
 * @return {@code true} if the property is not defined, or if the definition's type
 *         property is defined and its value is one of the property's values;
 *         {@code false} otherwise
 */
private boolean getSimilarityDefaultValue(NodeState defn, String propertyKey) {
    // Not configured at all: similarity indexing stays enabled.
    if (defn.getProperty(propertyKey) == null) {
        return true;
    }
    // Configured, but there is no index type to match against the configured values.
    if (defn.getProperty(TYPE_PROPERTY_NAME) == null) {
        return false;
    }
    String indexType = defn.getProperty(TYPE_PROPERTY_NAME).getValue(Type.STRING);
    Iterator<String> enabledTypes = defn.getProperty(propertyKey).getValue(Type.STRINGS).iterator();
    return isPresent(indexType, enabledTypes);
}

/**
 * Checks whether {@code key} equals any element produced by {@code iterator}.
 * The iterator is consumed only up to and including the first matching element.
 *
 * @param key      the value to look for; {@code equals} is invoked on it
 * @param iterator the elements to compare against
 * @return {@code true} if a matching element was found, {@code false} otherwise
 */
private <T> boolean isPresent(T key, Iterator<T> iterator) {
    boolean found = false;
    // Stop advancing as soon as a match is seen.
    while (!found && iterator.hasNext()) {
        found = key.equals(iterator.next());
    }
    return found;
}

/**
 * Returns the node state held in this instance's {@code definition} field.
 */
public NodeState getDefinitionNodeState() {
return definition;
}
@@ -519,6 +539,14 @@ public boolean isEnabled() {
return true;
}

/**
 * Tells whether binary properties should be indexed for similarity searches.
 *
 * @return the flag computed at construction time from the
 *         {@code INDEX_SIMILARITY_BINARIES} property of the index definition
 */
public boolean shouldIndexSimilarityBinaries() {
return indexSimilarityBinaries;
}

/**
 * Tells whether string properties should be indexed for similarity searches.
 *
 * @return the flag computed at construction time from the
 *         {@code INDEX_SIMILARITY_STRINGS} property of the index definition
 */
public boolean shouldIndexSimilarityStrings() {
return indexSimilarityStrings;
}

/**
 * Returns the value of this instance's {@code dynamicBoostLite} flag.
 */
public boolean isDynamicBoostLiteEnabled() {
return dynamicBoostLite;
}
@@ -242,9 +242,11 @@ private boolean indexProperty(String path,
boolean dirty = false;
if (Type.BINARY.tag() == property.getType().tag() && pd.useInSimilarity) {
try {
log.trace("indexing similarity binaries for {}", pd.name);
indexSimilarityBinaries(doc, pd, property.getValue(Type.BINARY));
dirty = true;
if (definition.shouldIndexSimilarityBinaries()) {
log.trace("indexing similarity binaries for {}", pd.name);
indexSimilarityBinaries(doc, pd, property.getValue(Type.BINARY));
dirty = true;
}
} catch (Exception e) {
log.error("could not index similarity field for property {} and definition {}", property, pd);
}
@@ -295,7 +297,9 @@ private boolean indexProperty(String path,
log.trace("indexing similarity strings for {}", pd.name);
try {
// fallback for when feature vectors are written in string typed properties
indexSimilarityStrings(doc, pd, value);
if (definition.shouldIndexSimilarityStrings()) {
indexSimilarityStrings(doc, pd, value);
}
} catch (Exception e) {
log.error("could not index similarity field for property {} and definition {}", property, pd);
}
@@ -36,6 +36,7 @@
import javax.jcr.Node;
import javax.jcr.PropertyType;
import javax.jcr.RepositoryException;
import java.util.Arrays;
import java.util.Collections;
import java.util.List;
import java.util.Map;
@@ -95,6 +96,16 @@ public IndexDefinitionBuilder evaluatePathRestrictions() {
return this;
}

/**
 * Stores the given names as the multi-valued string property
 * {@link FulltextIndexConstants#INDEX_SIMILARITY_BINARIES} on the builder tree.
 *
 * @param indexNames the values to store
 * @return this builder, for chaining
 */
public IndexDefinitionBuilder indexSimilarityBinaries(String... indexNames) {
    List<String> names = Arrays.asList(indexNames);
    getBuilderTree().setProperty(FulltextIndexConstants.INDEX_SIMILARITY_BINARIES, names, Type.STRINGS);
    return this;
}

/**
 * Stores the given names as the multi-valued string property
 * {@link FulltextIndexConstants#INDEX_SIMILARITY_STRINGS} on the builder tree.
 *
 * @param indexNames the values to store
 * @return this builder, for chaining
 */
public IndexDefinitionBuilder indexSimilarityStrings(String... indexNames) {
    List<String> names = Arrays.asList(indexNames);
    getBuilderTree().setProperty(FulltextIndexConstants.INDEX_SIMILARITY_STRINGS, names, Type.STRINGS);
    return this;
}

public IndexDefinitionBuilder includedPaths(String... paths) {
tree.setProperty(PathFilter.PROP_INCLUDED_PATHS, asList(paths), STRINGS);
return this;

0 comments on commit 353dc3c

Please sign in to comment.