From 9c35b680626f164578a8b1c2a3ea9c5cd85e0868 Mon Sep 17 00:00:00 2001 From: Osma Suominen Date: Wed, 4 Nov 2015 20:32:03 +0200 Subject: [PATCH] JENA-1062: configurable Lucene analyzer for jena-text --- .../text/analyzer/ConfigurableAnalyzer.java | 93 ++++++++++++++++ .../ConfigurableAnalyzerAssembler.java | 100 ++++++++++++++++++ .../query/text/assembler/TextAssembler.java | 1 + .../jena/query/text/assembler/TextVocab.java | 14 +++ .../org/apache/jena/query/text/TS_Text.java | 1 + .../TestDatasetWithConfigurableAnalyzer.java | 61 +++++++++++ .../assembler/TestEntityMapAssembler.java | 26 +++++ 7 files changed, 296 insertions(+) create mode 100644 jena-text/src/main/java/org/apache/jena/query/text/analyzer/ConfigurableAnalyzer.java create mode 100644 jena-text/src/main/java/org/apache/jena/query/text/assembler/ConfigurableAnalyzerAssembler.java create mode 100644 jena-text/src/test/java/org/apache/jena/query/text/TestDatasetWithConfigurableAnalyzer.java diff --git a/jena-text/src/main/java/org/apache/jena/query/text/analyzer/ConfigurableAnalyzer.java b/jena-text/src/main/java/org/apache/jena/query/text/analyzer/ConfigurableAnalyzer.java new file mode 100644 index 00000000000..ada3361360c --- /dev/null +++ b/jena-text/src/main/java/org/apache/jena/query/text/analyzer/ConfigurableAnalyzer.java @@ -0,0 +1,93 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.jena.query.text.analyzer ; + +import java.io.Reader ; +import java.util.List ; + +import org.apache.jena.query.text.TextIndexException; +import org.apache.lucene.analysis.Analyzer ; +import org.apache.lucene.analysis.TokenFilter ; +import org.apache.lucene.analysis.Tokenizer ; +import org.apache.lucene.analysis.TokenStream ; +import org.apache.lucene.analysis.core.KeywordTokenizer ; +import org.apache.lucene.analysis.core.LetterTokenizer; +import org.apache.lucene.analysis.core.LowerCaseFilter ; +import org.apache.lucene.analysis.core.WhitespaceTokenizer ; +import org.apache.lucene.analysis.miscellaneous.ASCIIFoldingFilter; +import org.apache.lucene.analysis.standard.StandardFilter; +import org.apache.lucene.analysis.standard.StandardTokenizer; +import org.apache.lucene.util.Version ; + + +/** + * Lucene Analyzer implementation that can be configured with different + * Tokenizer and (optionally) TokenFilter implementations. 
+ */
+
+public class ConfigurableAnalyzer extends Analyzer {
+    private final Version version;
+    private final String tokenizer;
+    private final List<String> filters;
+
+    /** Creates the tokenizer registered under {@code tokenizerName}; an unknown name is a TextIndexException. */
+    private Tokenizer getTokenizer(String tokenizerName, Reader reader) {
+        switch(tokenizerName) {
+            case "KeywordTokenizer":    return new KeywordTokenizer(reader);
+            case "LetterTokenizer":     return new LetterTokenizer(version, reader);
+            case "StandardTokenizer":   return new StandardTokenizer(version, reader);
+            case "WhitespaceTokenizer": return new WhitespaceTokenizer(version, reader);
+            default: throw new TextIndexException("Unknown tokenizer : " + tokenizerName);
+        }
+    }
+
+    /** Wraps {@code source} in the filter registered under {@code filterName}; an unknown name is a TextIndexException. */
+    private TokenFilter getTokenFilter(String filterName, TokenStream source) {
+        switch(filterName) {
+            case "ASCIIFoldingFilter": return new ASCIIFoldingFilter(source);
+            case "LowerCaseFilter":    return new LowerCaseFilter(version, source);
+            case "StandardFilter":     return new StandardFilter(version, source);
+            default: throw new TextIndexException("Unknown filter : " + filterName);
+        }
+    }
+
+    /**
+     * Creates an analyzer from a tokenizer name and an ordered list of filter names. The names
+     * are only resolved in createComponents, so an unknown name is reported there, not here.
+     * @param ver Lucene version passed to the version-aware tokenizers and filters
+     * @param tokenizer simple class name of the tokenizer, e.g. "StandardTokenizer"
+     * @param filters simple class names of the filters in application order; must not be null
+     */
+    public ConfigurableAnalyzer(Version ver, String tokenizer, List<String> filters) {
+        this.version = ver;
+        this.tokenizer = tokenizer;
+        this.filters = new java.util.ArrayList<>(filters); // defensive copy (ArrayList is not imported in this file)
+    }
+
+    /** Builds the analysis chain: the configured tokenizer, then each configured filter in list order. */
+    @Override
+    protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
+        Tokenizer source = getTokenizer(this.tokenizer, reader);
+        TokenStream stream = source;
+        for (String filter : this.filters) {
+            stream = getTokenFilter(filter, stream);
+        }
+        return new TokenStreamComponents(source, stream);
+    }
+}
diff --git a/jena-text/src/main/java/org/apache/jena/query/text/assembler/ConfigurableAnalyzerAssembler.java b/jena-text/src/main/java/org/apache/jena/query/text/assembler/ConfigurableAnalyzerAssembler.java new file mode 100644 index 00000000000..d336ed80727 --- /dev/null +++ b/jena-text/src/main/java/org/apache/jena/query/text/assembler/ConfigurableAnalyzerAssembler.java @@ -0,0 +1,100 @@ +/** + * Licensed to the Apache Software Foundation (ASF) 
under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.jena.query.text.assembler; + +import java.util.ArrayList; +import java.util.List; + +import org.apache.jena.assembler.Assembler; +import org.apache.jena.assembler.Mode; +import org.apache.jena.assembler.assemblers.AssemblerBase; +import org.apache.jena.query.text.TextIndexException; +import org.apache.jena.query.text.TextIndexLucene; +import org.apache.jena.query.text.analyzer.ConfigurableAnalyzer; +import org.apache.jena.rdf.model.RDFNode; +import org.apache.jena.rdf.model.Resource; +import org.apache.jena.rdf.model.Statement ; +import org.apache.jena.vocabulary.RDF ; +import org.apache.lucene.analysis.Analyzer; + +
+/**
+ * Assembler to create a configurable analyzer.
+ */
+public class ConfigurableAnalyzerAssembler extends AssemblerBase {
+    /*
+    text:map (
+         [ text:field "text" ;
+           text:predicate rdfs:label;
+           text:analyzer [
+               a text:ConfigurableAnalyzer ;
+               text:tokenizer text:LetterTokenizer ;
+               text:filters (text:LowerCaseFilter)
+           ]
+         ]
+        ) .
+    */
+
+    /** Builds a ConfigurableAnalyzer from root's text:tokenizer (required) and text:filters (optional).
+     *  @throws TextIndexException if the tokenizer is missing or not a named resource, or the filter list is malformed */
+    @Override
+    public Analyzer open(Assembler a, Resource root, Mode mode) {
+        if (! root.hasProperty(TextVocab.pTokenizer)) {
+            throw new TextIndexException("text:tokenizer setting is required by ConfigurableAnalyzer");
+        }
+        Resource tokenizerResource = root.getPropertyResourceValue(TextVocab.pTokenizer);
+        if (tokenizerResource == null || tokenizerResource.getLocalName() == null) {
+            throw new TextIndexException("text:tokenizer is not a named resource : " + root.getProperty(TextVocab.pTokenizer).getObject());
+        }
+        List<String> filters = root.hasProperty(TextVocab.pFilters)
+            ? toFilterList(root.getPropertyResourceValue(TextVocab.pFilters))
+            : new ArrayList<String>();
+        return new ConfigurableAnalyzer(TextIndexLucene.VER, tokenizerResource.getLocalName(), filters);
+    }
+
+    /** Walks an RDF list via rdf:first / rdf:rest and returns the local names of its members in order. */
+    private List<String> toFilterList(Resource list) {
+        if (list == null) {
+            throw new TextIndexException("text:filters is not a list of resources");
+        }
+        List<String> result = new ArrayList<>();
+        Resource current = list;
+        while (! current.equals(RDF.nil)) {
+            Statement first = current.getProperty(RDF.first);
+            if (first == null) {
+                throw new TextIndexException("filter list not well formed");
+            }
+            RDFNode member = first.getObject();
+            // A blank node passes isResource() but has no local name to select a filter by.
+            if (! member.isResource() || member.asResource().getLocalName() == null) {
+                throw new TextIndexException("filter is not a resource : " + member);
+            }
+            result.add(member.asResource().getLocalName());
+            Statement rest = current.getProperty(RDF.rest);
+            if (rest == null) {
+                throw new TextIndexException("filter list not terminated by rdf:nil");
+            }
+            if (! rest.getObject().isResource()) {
+                throw new TextIndexException("filter list node is not a resource : " + rest.getObject());
+            }
+            current = rest.getObject().asResource();
+        }
+        return result;
+    }
+}
diff --git a/jena-text/src/main/java/org/apache/jena/query/text/assembler/TextAssembler.java b/jena-text/src/main/java/org/apache/jena/query/text/assembler/TextAssembler.java index 021c0030713..5f7ca4d998f 100644 --- a/jena-text/src/main/java/org/apache/jena/query/text/assembler/TextAssembler.java +++ b/jena-text/src/main/java/org/apache/jena/query/text/assembler/TextAssembler.java @@ -35,6 +35,7 @@ public static void init() Assembler.general.implementWith(TextVocab.keywordAnalyzer, new KeywordAnalyzerAssembler()) ; Assembler.general.implementWith(TextVocab.lowerCaseKeywordAnalyzer, new LowerCaseKeywordAnalyzerAssembler()) ; Assembler.general.implementWith(TextVocab.localizedAnalyzer, new LocalizedAnalyzerAssembler()) ; + Assembler.general.implementWith(TextVocab.configurableAnalyzer, new ConfigurableAnalyzerAssembler()) ; } } diff --git a/jena-text/src/main/java/org/apache/jena/query/text/assembler/TextVocab.java b/jena-text/src/main/java/org/apache/jena/query/text/assembler/TextVocab.java index fb14505e833..705b56541dd 100644 --- a/jena-text/src/main/java/org/apache/jena/query/text/assembler/TextVocab.java +++ b/jena-text/src/main/java/org/apache/jena/query/text/assembler/TextVocab.java @@ -43,6 +43,8 @@ public class TextVocab public static final Property pStoreValues = Vocab.property(NS, "storeValues") ; public static final Property pQueryAnalyzer = Vocab.property(NS, "queryAnalyzer") ; public static final Property pEntityMap = Vocab.property(NS, "entityMap") ; + public static final Property pTokenizer = Vocab.property(NS, "tokenizer") ; + public static final Property pFilters = Vocab.property(NS, "filters") ; // Entity definition public static final Resource entityMap = Vocab.resource(NS, "EntityMap") ; @@ -64,6 +66,18 @@ public class TextVocab public static final Resource keywordAnalyzer = 
Vocab.resource(NS, "KeywordAnalyzer"); public static final Resource lowerCaseKeywordAnalyzer = Vocab.resource(NS, "LowerCaseKeywordAnalyzer"); public static final Resource localizedAnalyzer = Vocab.resource(NS, "LocalizedAnalyzer"); + public static final Resource configurableAnalyzer = Vocab.resource(NS, "ConfigurableAnalyzer"); + + // Tokenizers + public static final Resource standardTokenizer = Vocab.resource(NS, "StandardTokenizer"); + public static final Resource letterTokenizer = Vocab.resource(NS, "LetterTokenizer"); + public static final Resource keywordTokenizer = Vocab.resource(NS, "KeywordTokenizer"); + public static final Resource whitespaceTokenizer = Vocab.resource(NS, "WhitespaceTokenizer"); + + // Filters + public static final Resource standardFilter = Vocab.resource(NS, "StandardFilter"); + public static final Resource lowerCaseFilter = Vocab.resource(NS, "LowerCaseFilter"); + public static final Resource asciiFoldingFilter = Vocab.resource(NS, "ASCIIFoldingFilter"); } diff --git a/jena-text/src/test/java/org/apache/jena/query/text/TS_Text.java b/jena-text/src/test/java/org/apache/jena/query/text/TS_Text.java index 3459e4333a8..6e0be2cad4a 100644 --- a/jena-text/src/test/java/org/apache/jena/query/text/TS_Text.java +++ b/jena-text/src/test/java/org/apache/jena/query/text/TS_Text.java @@ -50,6 +50,7 @@ , TestDatasetWithLowerCaseKeywordAnalyzer.class , TestLuceneWithMultipleThreads.class , TestDatasetWithLocalizedAnalyzer.class + , TestDatasetWithConfigurableAnalyzer.class }) public class TS_Text diff --git a/jena-text/src/test/java/org/apache/jena/query/text/TestDatasetWithConfigurableAnalyzer.java b/jena-text/src/test/java/org/apache/jena/query/text/TestDatasetWithConfigurableAnalyzer.java new file mode 100644 index 00000000000..ad3c4177a37 --- /dev/null +++ b/jena-text/src/test/java/org/apache/jena/query/text/TestDatasetWithConfigurableAnalyzer.java @@ -0,0 +1,61 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more 
contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.jena.query.text; + +import java.util.Set ; + +import org.apache.jena.atlas.lib.StrUtils ; +import org.apache.jena.ext.com.google.common.collect.Sets ; +import org.junit.Before ; +import org.junit.Test ; +
+/**
+ * This class defines a setup configuration for a dataset that uses an ASCII folding lowercase keyword analyzer with a Lucene index.
+ * The analyzer is declared as a text:ConfigurableAnalyzer rather than through a dedicated analyzer class.
+ */
+public class TestDatasetWithConfigurableAnalyzer extends TestDatasetWithLowerCaseKeywordAnalyzer {
+    /** Initialises the fixture with a keyword tokenizer followed by ASCII folding and lower-casing. */
+    @Override
+    @Before
+    public void before() {
+        init(StrUtils.strjoinNL(
+            "text:ConfigurableAnalyzer ;",
+            "text:tokenizer text:KeywordTokenizer ;",
+            "text:filters (text:ASCIIFoldingFilter text:LowerCaseFilter)"));
+    }
+
+    /** The query text differs from the indexed label in letter case and in accents, yet must match it. */
+    @Test
+    public void testConfigurableAnalyzerIsCaseAndAccentInsensitive() {
+        final String testName = "testConfigurableAnalyzerIsCaseAndAccentInsensitive";
+        final String turtle = StrUtils.strjoinNL(
+            TURTLE_PROLOG,
+            "<" + RESOURCE_BASE + testName + ">",
+            " rdfs:label 'Feeling a déjà vu'",
+            ".");
+        String queryString = StrUtils.strjoinNL(
+            QUERY_PROLOG,
+            "SELECT ?s",
+            "WHERE {",
+            " ?s text:query ( rdfs:label '\"feeling ä déja\"*' 10 ) .",
+            "}");
+        Set<String> expectedURIs = Sets.newHashSet(RESOURCE_BASE + testName);
+        doTestSearch(turtle, queryString, expectedURIs);
+    }
+}
diff --git a/jena-text/src/test/java/org/apache/jena/query/text/assembler/TestEntityMapAssembler.java b/jena-text/src/test/java/org/apache/jena/query/text/assembler/TestEntityMapAssembler.java index ab3ed299953..e4c823d0652 100644 --- a/jena-text/src/test/java/org/apache/jena/query/text/assembler/TestEntityMapAssembler.java +++ b/jena-text/src/test/java/org/apache/jena/query/text/assembler/TestEntityMapAssembler.java @@ -29,6 +29,7 @@ import org.apache.jena.graph.Node ; import org.apache.jena.query.text.EntityDefinition ; import org.apache.jena.query.text.TextIndexException ; +import org.apache.jena.query.text.analyzer.ConfigurableAnalyzer ; import org.apache.jena.query.text.analyzer.LowerCaseKeywordAnalyzer ; import org.apache.jena.rdf.model.* ; import org.apache.jena.vocabulary.RDF ; @@ -56,6 +57,7 @@ public class TestEntityMapAssembler { private static final Resource spec4; private static final Resource spec5; private static final Resource spec6; + private static final Resource spec7; private static final Resource specNoEntityField; private static final Resource specNoDefaultField; private static final Resource specNoMapProperty; @@ -119,6 +121,12 @@ private Object getOne(EntityDefinition entityDef, String field) { assertEquals(LowerCaseKeywordAnalyzer.class, entityDef.getAnalyzer(SPEC1_DEFAULT_FIELD).getClass()); } + @Test public void EntityHasMapEntryWithConfigurableAnalyzer() { + EntityDefinitionAssembler entDefAssem = new EntityDefinitionAssembler(); + EntityDefinition entityDef = entDefAssem.open(Assembler.general, spec7, null); + assertEquals(ConfigurableAnalyzer.class, entityDef.getAnalyzer(SPEC1_DEFAULT_FIELD).getClass()); + } + @Test(expected=TextIndexException.class) 
public void errorOnNoEntityField() { EntityDefinitionAssembler entDefAssem = new EntityDefinitionAssembler(); entDefAssem.open(null, specNoEntityField, null); @@ -254,6 +262,24 @@ private Object getOne(EntityDefinition entityDef, String field) { .addProperty(RDF.type, TextVocab.lowerCaseKeywordAnalyzer)) })); + + // create an entity map specification using a configurable analyzer + + spec7 = model.createResource(TESTBASE + "spec7") + .addProperty(TextVocab.pEntityField, SPEC1_ENTITY_FIELD) + .addProperty(TextVocab.pDefaultField, SPEC1_DEFAULT_FIELD) + .addProperty(TextVocab.pMap, + model.createList( + new RDFNode[] { + model.createResource() + .addProperty(TextVocab.pField, SPEC1_DEFAULT_FIELD) + .addProperty(TextVocab.pPredicate, SPEC1_PREDICATE) + .addProperty(TextVocab.pAnalyzer, + model.createResource() + .addProperty(RDF.type, TextVocab.configurableAnalyzer) + .addProperty(TextVocab.pTokenizer, TextVocab.standardTokenizer)) + })); + // bad assembler spec specNoEntityField =