From 5e4e1c1432f44151356fe25cc44c87e0085c1873 Mon Sep 17 00:00:00 2001 From: Alexis Miara Date: Tue, 21 Apr 2015 14:19:32 -0400 Subject: [PATCH 1/9] change on pom.xml to have local groupId --- jena-text/pom.xml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/jena-text/pom.xml b/jena-text/pom.xml index b33ca6fdfc1..f3d01fbb926 100644 --- a/jena-text/pom.xml +++ b/jena-text/pom.xml @@ -18,6 +18,7 @@ 4.0.0 + licef jena-text jar Apache Jena - SPARQL Text Search @@ -139,6 +140,7 @@ **/TS_*.java + true From d3f21853c0d0556ad95ae06c393fb8a8619feb35 Mon Sep 17 00:00:00 2001 From: Alexis Miara Date: Wed, 22 Apr 2015 14:55:58 -0400 Subject: [PATCH 2/9] Introducing Lucene multilingual index --- jena-text/pom.xml | 1 - .../main/java/examples/JenaTextExample1.java | 2 +- .../apache/jena/query/text/LuceneUtil.java | 93 ++++++++++++ .../jena/query/text/TextDatasetFactory.java | 101 ++++++++++++- .../query/text/TextDocProducerTriples.java | 7 +- .../jena/query/text/TextIndexLucene.java | 14 +- .../text/TextIndexLuceneMultiLingual.java | 138 ++++++++++++++++++ .../apache/jena/query/text/TextQueryPF.java | 23 +++ .../assembler/TextIndexLuceneAssembler.java | 2 +- ...ctTestDatasetWithLuceneGraphTextIndex.java | 2 +- .../jena/query/text/TestBuildTextDataset.java | 2 +- .../text/TestLuceneWithMultipleThreads.java | 6 +- .../apache/jena/query/text/TestTextTDB.java | 2 +- 13 files changed, 370 insertions(+), 23 deletions(-) create mode 100644 jena-text/src/main/java/org/apache/jena/query/text/LuceneUtil.java create mode 100644 jena-text/src/main/java/org/apache/jena/query/text/TextIndexLuceneMultiLingual.java diff --git a/jena-text/pom.xml b/jena-text/pom.xml index f3d01fbb926..3eb2d347ec5 100644 --- a/jena-text/pom.xml +++ b/jena-text/pom.xml @@ -140,7 +140,6 @@ **/TS_*.java - true diff --git a/jena-text/src/main/java/examples/JenaTextExample1.java b/jena-text/src/main/java/examples/JenaTextExample1.java index 631a0963e9e..6b26e390a86 100644 --- 
a/jena-text/src/main/java/examples/JenaTextExample1.java +++ b/jena-text/src/main/java/examples/JenaTextExample1.java @@ -65,7 +65,7 @@ public static Dataset createCode() Directory dir = new RAMDirectory(); // Join together into a dataset - Dataset ds = TextDatasetFactory.createLucene(ds1, dir, entDef, null) ; + Dataset ds = TextDatasetFactory.createLucene(ds1, dir, entDef, null, null) ; return ds ; } diff --git a/jena-text/src/main/java/org/apache/jena/query/text/LuceneUtil.java b/jena-text/src/main/java/org/apache/jena/query/text/LuceneUtil.java new file mode 100644 index 00000000000..a4c6145660b --- /dev/null +++ b/jena-text/src/main/java/org/apache/jena/query/text/LuceneUtil.java @@ -0,0 +1,93 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.jena.query.text; + +import org.apache.lucene.analysis.Analyzer; +import org.apache.lucene.util.Version; +import java.lang.reflect.Constructor; +import java.util.Hashtable; + +public class LuceneUtil { + + private static Hashtable analyzers; //mapping between ISO2-letter language and lucene existing analyzers + + static { + initAnalyzerDefs(); + } + + public static Analyzer createAnalyzer(String lang, Version ver) { + lang = getISO2Language(lang); + if (lang == null) + return null; + + try { + Class className = analyzers.get(lang); + if (className == null) + return null; + Constructor constructor = className.getConstructor(Version.class); + return (Analyzer)constructor.newInstance(ver); + } catch (Exception e) { + e.printStackTrace(); + return null; + } + } + + public static String getISO2Language(String lang) { + if (lang == null) + return null; + else + return lang.split("-")[0].toLowerCase(); + } + + private static void initAnalyzerDefs() { + analyzers = new Hashtable<>(); + analyzers.put("ar", org.apache.lucene.analysis.ar.ArabicAnalyzer.class); + analyzers.put("bg", org.apache.lucene.analysis.bg.BulgarianAnalyzer.class); + analyzers.put("ca", org.apache.lucene.analysis.ca.CatalanAnalyzer.class); + analyzers.put("cs", org.apache.lucene.analysis.cz.CzechAnalyzer.class); + analyzers.put("da", org.apache.lucene.analysis.da.DanishAnalyzer.class); + analyzers.put("de", org.apache.lucene.analysis.de.GermanAnalyzer.class); + analyzers.put("el", org.apache.lucene.analysis.el.GreekAnalyzer.class); + analyzers.put("en", org.apache.lucene.analysis.en.EnglishAnalyzer.class); + analyzers.put("es", org.apache.lucene.analysis.es.SpanishAnalyzer.class); + analyzers.put("eu", org.apache.lucene.analysis.eu.BasqueAnalyzer.class); + analyzers.put("fa", org.apache.lucene.analysis.fa.PersianAnalyzer.class); + analyzers.put("fi", org.apache.lucene.analysis.fi.FinnishAnalyzer.class); + analyzers.put("fr", org.apache.lucene.analysis.fr.FrenchAnalyzer.class); + 
analyzers.put("ga", org.apache.lucene.analysis.ga.IrishAnalyzer.class); + analyzers.put("gl", org.apache.lucene.analysis.gl.GalicianAnalyzer.class); + analyzers.put("hi", org.apache.lucene.analysis.hi.HindiAnalyzer.class); + analyzers.put("hu", org.apache.lucene.analysis.hu.HungarianAnalyzer.class); + analyzers.put("hy", org.apache.lucene.analysis.hy.ArmenianAnalyzer.class); + analyzers.put("id", org.apache.lucene.analysis.id.IndonesianAnalyzer.class); + analyzers.put("it", org.apache.lucene.analysis.it.ItalianAnalyzer.class); + analyzers.put("ja", org.apache.lucene.analysis.cjk.CJKAnalyzer.class); + analyzers.put("ko", org.apache.lucene.analysis.cjk.CJKAnalyzer.class); + analyzers.put("lv", org.apache.lucene.analysis.lv.LatvianAnalyzer.class); + analyzers.put("nl", org.apache.lucene.analysis.nl.DutchAnalyzer.class); + analyzers.put("no", org.apache.lucene.analysis.no.NorwegianAnalyzer.class); + analyzers.put("pt", org.apache.lucene.analysis.pt.PortugueseAnalyzer.class); + analyzers.put("ro", org.apache.lucene.analysis.ro.RomanianAnalyzer.class); + analyzers.put("ru", org.apache.lucene.analysis.ru.RussianAnalyzer.class); + analyzers.put("sv", org.apache.lucene.analysis.sv.SwedishAnalyzer.class); + analyzers.put("th", org.apache.lucene.analysis.th.ThaiAnalyzer.class); + analyzers.put("tr", org.apache.lucene.analysis.tr.TurkishAnalyzer.class); + analyzers.put("zh", org.apache.lucene.analysis.cjk.CJKAnalyzer.class); + } +} diff --git a/jena-text/src/main/java/org/apache/jena/query/text/TextDatasetFactory.java b/jena-text/src/main/java/org/apache/jena/query/text/TextDatasetFactory.java index c7e42082346..45aac330a50 100644 --- a/jena-text/src/main/java/org/apache/jena/query/text/TextDatasetFactory.java +++ b/jena-text/src/main/java/org/apache/jena/query/text/TextDatasetFactory.java @@ -29,6 +29,8 @@ import com.hp.hpl.jena.sparql.core.assembler.AssemblerUtils ; import com.hp.hpl.jena.sparql.util.Context ; +import java.io.File; + public class TextDatasetFactory { static 
{ TextQuery.init(); } @@ -90,43 +92,128 @@ public static DatasetGraph create(DatasetGraph dsg, TextIndex textIndex, boolean * * @param directory The Lucene Directory for the index * @param def The EntityDefinition that defines how entities are stored in the index + * @param analyzer The analyzer to be used to index literals. If null, then the standard analyzer will be used. * @param queryAnalyzer The analyzer to be used to find terms in the query text. If null, then the analyzer defined by the EntityDefinition will be used. */ - public static TextIndex createLuceneIndex(Directory directory, EntityDefinition def, Analyzer queryAnalyzer) + public static TextIndex createLuceneIndex(Directory directory, EntityDefinition def, Analyzer analyzer, Analyzer queryAnalyzer) { - TextIndex index = new TextIndexLucene(directory, def, queryAnalyzer) ; + TextIndex index = new TextIndexLucene(directory, def, analyzer, queryAnalyzer) ; return index ; } + /** + * Create a localized Lucene TextIndex + * + * @param directory The Lucene Directory for the index + * @param def The EntityDefinition that defines how entities are stored in the index + * @param lang The language related with the analyzer. + * @param queryAnalyzer The analyzer to be used to find terms in the query text. If null, then the analyzer defined by the EntityDefinition will be used. 
+ */ + public static TextIndex createLuceneIndexFromLanguage(Directory directory, EntityDefinition def, String lang, Analyzer queryAnalyzer) + { + return createLuceneIndex(directory, def, LuceneUtil.createAnalyzer(lang, TextIndexLucene.VER), queryAnalyzer); + } + + /** + * Create a multilingual Lucene TextIndex + * + * @param directory The Lucene Directory for the index + * @param def The EntityDefinition that defines how entities are stored in the index + */ + public static TextIndex createLuceneIndexMultiLingual(File directory, EntityDefinition def) + { + TextIndex index = new TextIndexLuceneMultiLingual(directory, def) ; + return index ; + } + /** * Create a text-indexed dataset, using Lucene * * @param base the base Dataset * @param directory The Lucene Directory for the index * @param def The EntityDefinition that defines how entities are stored in the index + * @param analyzer The analyzer to be used to index literals. If null, then the standard analyzer will be used. * @param queryAnalyzer The analyzer to be used to find terms in the query text. If null, then the analyzer defined by the EntityDefinition will be used. */ - public static Dataset createLucene(Dataset base, Directory directory, EntityDefinition def, Analyzer queryAnalyzer) + public static Dataset createLucene(Dataset base, Directory directory, EntityDefinition def, Analyzer analyzer, Analyzer queryAnalyzer) { - TextIndex index = createLuceneIndex(directory, def, queryAnalyzer) ; + TextIndex index = createLuceneIndex(directory, def, analyzer, queryAnalyzer) ; return create(base, index, true) ; } + /** + * Create a localized text-indexed dataset, using Lucene + * + * @param base the base Dataset + * @param directory The Lucene Directory for the index + * @param def The EntityDefinition that defines how entities are stored in the index + * @param lang The language related with the analyzer. + * @param queryAnalyzer The analyzer to be used to find terms in the query text. 
If null, then the analyzer defined by the EntityDefinition will be used. + */ + public static Dataset createLuceneFromLanguage(Dataset base, Directory directory, EntityDefinition def, String lang, Analyzer queryAnalyzer) + { + TextIndex index = createLuceneIndexFromLanguage(directory, def, lang, queryAnalyzer) ; + return create(base, index, true) ; + } + + /** + * Create a multilingual text-indexed dataset, using Lucene + * + * @param base the base Dataset + * @param directory The Lucene Directory for the index + * @param def The EntityDefinition that defines how entities are stored in the index + */ + public static Dataset createLuceneMultilingual(Dataset base, File directory, EntityDefinition def) + { + TextIndex index = createLuceneIndexMultiLingual(directory, def) ; + return create(base, index, true) ; + } + /** * Create a text-indexed dataset, using Lucene * * @param base the base DatasetGraph * @param directory The Lucene Directory for the index * @param def The EntityDefinition that defines how entities are stored in the index + * @param analyzer The analyzer to be used to index literals. If null, then the standard analyzer will be used. * @param queryAnalyzer The analyzer to be used to find terms in the query text. If null, then the analyzer defined by the EntityDefinition will be used. 
*/ - public static DatasetGraph createLucene(DatasetGraph base, Directory directory, EntityDefinition def, Analyzer queryAnalyzer) + public static DatasetGraph createLucene(DatasetGraph base, Directory directory, EntityDefinition def, Analyzer analyzer, Analyzer queryAnalyzer) { - TextIndex index = createLuceneIndex(directory, def, queryAnalyzer) ; + TextIndex index = createLuceneIndex(directory, def, analyzer, queryAnalyzer) ; return create(base, index, true) ; } - /** Create a Solr TextIndex */ + /** + * Create a localized text-indexed dataset, using Lucene + * + * @param base the base DatasetGraph + * @param directory The Lucene Directory for the index + * @param def The EntityDefinition that defines how entities are stored in the index + * @param lang The language related with the analyzer. + * @param queryAnalyzer The analyzer to be used to find terms in the query text. If null, then the analyzer defined by the EntityDefinition will be used. + */ + public static DatasetGraph createLuceneFromLanguage(DatasetGraph base, Directory directory, EntityDefinition def, String lang, Analyzer queryAnalyzer) + { + TextIndex index = createLuceneIndexFromLanguage(directory, def, lang, queryAnalyzer) ; + return create(base, index, true) ; + } + + /** + * Create a multilingual text-indexed dataset, using Lucene + * + * @param base the base DatasetGraph + * @param directory The Lucene Directory for the index + * @param def The EntityDefinition that defines how entities are stored in the index + */ + public static DatasetGraph createLuceneMultilingual(DatasetGraph base, File directory, EntityDefinition def) + { + TextIndex index = createLuceneIndexMultiLingual(directory, def) ; + return create(base, index, true) ; + } + + + /** Create a Solr TextIndex */ public static TextIndex createSolrIndex(SolrServer server, EntityDefinition entMap) { TextIndex index = new TextIndexSolr(server, entMap) ; diff --git 
a/jena-text/src/main/java/org/apache/jena/query/text/TextDocProducerTriples.java b/jena-text/src/main/java/org/apache/jena/query/text/TextDocProducerTriples.java index b295148d961..8346261ffe0 100644 --- a/jena-text/src/main/java/org/apache/jena/query/text/TextDocProducerTriples.java +++ b/jena-text/src/main/java/org/apache/jena/query/text/TextDocProducerTriples.java @@ -63,7 +63,12 @@ public void change(QuadAction qaction, Node g, Node s, Node p, Node o) { Entity entity = TextQueryFuncs.entityFromQuad(defn, g, s, p, o) ; // Null means does not match defn if ( entity != null ) { - indexer.addEntity(entity) ; + if (indexer instanceof TextIndexLuceneMultiLingual) { + String lang = o.getLiteral().language(); + ((TextIndexLuceneMultiLingual)indexer).addEntity(entity, lang); + } + else + indexer.addEntity(entity) ; // Auto commit the entity if we aren't in a transaction if (!inTransaction.get()) { diff --git a/jena-text/src/main/java/org/apache/jena/query/text/TextIndexLucene.java b/jena-text/src/main/java/org/apache/jena/query/text/TextIndexLucene.java index caebea18b74..77f09908396 100644 --- a/jena-text/src/main/java/org/apache/jena/query/text/TextIndexLucene.java +++ b/jena-text/src/main/java/org/apache/jena/query/text/TextIndexLucene.java @@ -89,9 +89,10 @@ public class TextIndexLucene implements TextIndex { * * @param directory The Lucene Directory for the index * @param def The EntityDefinition that defines how entities are stored in the index + * @param analyzer The analyzer to be used to index literals. If null, then the standard analyzer will be used. * @param queryAnalyzer The analyzer to be used to find terms in the query text. If null, then the analyzer defined by the EntityDefinition will be used. 
*/ - public TextIndexLucene(Directory directory, EntityDefinition def, Analyzer queryAnalyzer) { + public TextIndexLucene(Directory directory, EntityDefinition def, Analyzer analyzer, Analyzer queryAnalyzer) { this.directory = directory ; this.docDef = def ; @@ -103,14 +104,15 @@ public TextIndexLucene(Directory directory, EntityDefinition def, Analyzer query analyzerPerField.put(def.getGraphField(), new KeywordAnalyzer()) ; for (String field : def.fields()) { - Analyzer analyzer = def.getAnalyzer(field); - if (analyzer != null) { - analyzerPerField.put(field, analyzer); + Analyzer _analyzer = def.getAnalyzer(field); + if (_analyzer != null) { + analyzerPerField.put(field, _analyzer); } } - this.analyzer = new PerFieldAnalyzerWrapper(new StandardAnalyzer(VER), analyzerPerField) ; - this.queryAnalyzer = (null != queryAnalyzer) ? queryAnalyzer : analyzer ; + this.analyzer = new PerFieldAnalyzerWrapper( + (null != analyzer) ? analyzer : new StandardAnalyzer(VER), analyzerPerField) ; + this.queryAnalyzer = (null != queryAnalyzer) ? queryAnalyzer : this.analyzer ; openIndexWriter(); } diff --git a/jena-text/src/main/java/org/apache/jena/query/text/TextIndexLuceneMultiLingual.java b/jena-text/src/main/java/org/apache/jena/query/text/TextIndexLuceneMultiLingual.java new file mode 100644 index 00000000000..0e9608693d8 --- /dev/null +++ b/jena-text/src/main/java/org/apache/jena/query/text/TextIndexLuceneMultiLingual.java @@ -0,0 +1,138 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.jena.query.text; + +import com.hp.hpl.jena.graph.Node; +import org.apache.lucene.analysis.Analyzer; +import org.apache.lucene.store.Directory; +import org.apache.lucene.store.FSDirectory; + +import java.io.File; +import java.io.IOException; +import java.util.*; + +public class TextIndexLuceneMultiLingual implements TextIndex { + + Hashtable indexes; + File indexDir; + private final EntityDefinition docDef; + + public TextIndexLuceneMultiLingual(File directory, EntityDefinition def) { + docDef = def; + indexes = new Hashtable<>(); + + try { + //default index created first. Localized index will be created on the fly. 
+ indexDir = directory; + Directory dir = FSDirectory.open(indexDir); + TextIndex index = new TextIndexLucene(dir, def, null, null); + indexes.put("default", index); + + } catch (IOException e) { + e.printStackTrace(); + } + } + + public Collection getIndexes() { + return indexes.values(); + } + + TextIndex getIndex(String lang) { + lang = LuceneUtil.getISO2Language(lang); + if (lang == null) + lang = "default"; + + if (!indexes.containsKey(lang)) { + //dynamic creation of localized index + try { + Analyzer analyzer = LuceneUtil.createAnalyzer(lang, TextIndexLucene.VER); + if (analyzer != null) { + File indexDirLang = new File(indexDir, lang); + Directory dir = FSDirectory.open(indexDirLang); + TextIndex index = new TextIndexLucene(dir, docDef, analyzer, null); + indexes.put(lang, index); + } + else + lang = "default"; + } catch (IOException e) { + e.printStackTrace(); + } + } + + return indexes.get(lang); + } + + @Override + public void prepareCommit() { + for (TextIndex index : indexes.values()) + index.prepareCommit(); + } + + @Override + public void commit() { + for (TextIndex index : indexes.values()) + index.commit(); + } + + @Override + public void rollback() { + for (TextIndex index : indexes.values()) + index.rollback(); + } + + @Override + public void addEntity(Entity entity) { + } + + public void addEntity(Entity entity, String lang) { + getIndex(lang).addEntity(entity); + } + + @Override + public void updateEntity(Entity entity) { + + } + + @Override + public Map get(String uri) { + return null; + } + + @Override + public List query(String qs, int limit) { + return null; + } + + @Override + public List query(String qs) { + return null; + } + + @Override + public EntityDefinition getDocDef() { + return docDef; + } + + @Override + public void close() { + for (TextIndex index : indexes.values()) + index.close(); + } + +} diff --git a/jena-text/src/main/java/org/apache/jena/query/text/TextQueryPF.java 
b/jena-text/src/main/java/org/apache/jena/query/text/TextQueryPF.java index a17308b4a63..f2658e2c7a8 100644 --- a/jena-text/src/main/java/org/apache/jena/query/text/TextQueryPF.java +++ b/jena-text/src/main/java/org/apache/jena/query/text/TextQueryPF.java @@ -18,6 +18,7 @@ package org.apache.jena.query.text ; +import java.util.Iterator; import java.util.List ; import org.apache.jena.atlas.iterator.Iter ; @@ -77,6 +78,16 @@ public void build(PropFuncArg argSubject, Node predicate, PropFuncArg argObject, if (list.size() > 4) throw new QueryBuildException("Too many arguments in list : " + list) ; } + + // If retrieved index is an instance of TextIndexLuceneMultiLingual, we need to switch with the right index. + // The pattern is : + // ?uri text:query (property 'string' ['lang:language']) + // ex : ?uri text:query (rdfs:label 'livre' 'lang:fr') + // note: default index is the unlocalized index (if lang arg is not present). + if (server instanceof TextIndexLuceneMultiLingual) { + String lang = getArg("lang", argObject); + server = ((TextIndexLuceneMultiLingual)server).getIndex(lang); + } } private static TextIndex chooseTextIndex(DatasetGraph dsg) { @@ -101,6 +112,18 @@ private static TextIndex chooseTextIndex(DatasetGraph dsg) { return null ; } + private String getArg(String prefix, PropFuncArg argObject) { + for (Iterator it = argObject.getArgList().iterator(); it.hasNext(); ) { + Node node = (Node)it.next(); + if (node.isLiteral()) { + String arg = node.getLiteral().toString(); + if (arg.startsWith(prefix + ":")) + return arg.split(":")[1]; + } + } + return null; + } + @Override public QueryIterator exec(Binding binding, PropFuncArg argSubject, Node predicate, PropFuncArg argObject, ExecutionContext execCxt) { diff --git a/jena-text/src/main/java/org/apache/jena/query/text/assembler/TextIndexLuceneAssembler.java b/jena-text/src/main/java/org/apache/jena/query/text/assembler/TextIndexLuceneAssembler.java index 670c530751f..75a27bb61af 100644 --- 
a/jena-text/src/main/java/org/apache/jena/query/text/assembler/TextIndexLuceneAssembler.java +++ b/jena-text/src/main/java/org/apache/jena/query/text/assembler/TextIndexLuceneAssembler.java @@ -93,7 +93,7 @@ public TextIndex open(Assembler a, Resource root, Mode mode) { Resource r = GraphUtils.getResourceValue(root, pEntityMap) ; EntityDefinition docDef = (EntityDefinition)a.open(r) ; - return TextDatasetFactory.createLuceneIndex(directory, docDef, queryAnalyzer) ; + return TextDatasetFactory.createLuceneIndex(directory, docDef, null, queryAnalyzer) ; } catch (IOException e) { IO.exception(e) ; return null ; diff --git a/jena-text/src/test/java/org/apache/jena/query/text/AbstractTestDatasetWithLuceneGraphTextIndex.java b/jena-text/src/test/java/org/apache/jena/query/text/AbstractTestDatasetWithLuceneGraphTextIndex.java index 1011ba4ed67..417fe1cfc9d 100644 --- a/jena-text/src/test/java/org/apache/jena/query/text/AbstractTestDatasetWithLuceneGraphTextIndex.java +++ b/jena-text/src/test/java/org/apache/jena/query/text/AbstractTestDatasetWithLuceneGraphTextIndex.java @@ -39,7 +39,7 @@ public void init() { Directory dir = new RAMDirectory() ; EntityDefinition eDef = new EntityDefinition("iri", "text", "graph", RDFS.label.asNode()) ; eDef.set("comment", RDFS.comment.asNode()) ; // some tests require indexing rdfs:comment - TextIndex tidx = new TextIndexLucene(dir, eDef, null) ; + TextIndex tidx = new TextIndexLucene(dir, eDef, null, null) ; dataset = TextDatasetFactory.create(ds1, tidx) ; } diff --git a/jena-text/src/test/java/org/apache/jena/query/text/TestBuildTextDataset.java b/jena-text/src/test/java/org/apache/jena/query/text/TestBuildTextDataset.java index e045477de5f..6a22802ac5b 100644 --- a/jena-text/src/test/java/org/apache/jena/query/text/TestBuildTextDataset.java +++ b/jena-text/src/test/java/org/apache/jena/query/text/TestBuildTextDataset.java @@ -117,7 +117,7 @@ public static Dataset createCode() { Directory dir = new RAMDirectory() ; // Join together into 
a dataset - Dataset ds = TextDatasetFactory.createLucene(ds1, dir, entDef, null) ; + Dataset ds = TextDatasetFactory.createLucene(ds1, dir, entDef, null, null) ; return ds ; } diff --git a/jena-text/src/test/java/org/apache/jena/query/text/TestLuceneWithMultipleThreads.java b/jena-text/src/test/java/org/apache/jena/query/text/TestLuceneWithMultipleThreads.java index b14526ee315..ffc746b1a25 100644 --- a/jena-text/src/test/java/org/apache/jena/query/text/TestLuceneWithMultipleThreads.java +++ b/jena-text/src/test/java/org/apache/jena/query/text/TestLuceneWithMultipleThreads.java @@ -63,7 +63,7 @@ public class TestLuceneWithMultipleThreads @Test public void testReadInMiddleOfWrite() throws InterruptedException, ExecutionException { - final DatasetGraphText dsg = (DatasetGraphText)TextDatasetFactory.createLucene(new GraphStoreNullTransactional(), new RAMDirectory(), entDef, null); + final DatasetGraphText dsg = (DatasetGraphText)TextDatasetFactory.createLucene(new GraphStoreNullTransactional(), new RAMDirectory(), entDef, null, null); final Dataset ds = DatasetFactory.create(dsg); final ExecutorService execService = Executors.newSingleThreadExecutor(); final Future f = execService.submit(new Runnable() @@ -119,7 +119,7 @@ public void run() @Test public void testWriteInMiddleOfRead() throws InterruptedException, ExecutionException { - final DatasetGraphText dsg = (DatasetGraphText)TextDatasetFactory.createLucene(new GraphStoreNullTransactional(), new RAMDirectory(), entDef, null); + final DatasetGraphText dsg = (DatasetGraphText)TextDatasetFactory.createLucene(new GraphStoreNullTransactional(), new RAMDirectory(), entDef, null, null); final int numReads = 10; final Dataset ds = DatasetFactory.create(dsg); final ExecutorService execService = Executors.newFixedThreadPool(10); @@ -187,7 +187,7 @@ public void run() @Test public void testIsolation() throws InterruptedException, ExecutionException { - final DatasetGraphText dsg = 
(DatasetGraphText)TextDatasetFactory.createLucene(DatasetGraphFactory.createMem(), new RAMDirectory(), entDef, null); + final DatasetGraphText dsg = (DatasetGraphText)TextDatasetFactory.createLucene(DatasetGraphFactory.createMem(), new RAMDirectory(), entDef, null, null); final int numReaders = 2; final List> futures = new ArrayList>(numReaders); diff --git a/jena-text/src/test/java/org/apache/jena/query/text/TestTextTDB.java b/jena-text/src/test/java/org/apache/jena/query/text/TestTextTDB.java index a80d399c06e..cb592714d27 100644 --- a/jena-text/src/test/java/org/apache/jena/query/text/TestTextTDB.java +++ b/jena-text/src/test/java/org/apache/jena/query/text/TestTextTDB.java @@ -40,7 +40,7 @@ private static Dataset create() { Dataset ds1 = TDBFactory.createDataset() ; Directory dir = new RAMDirectory() ; EntityDefinition eDef = new EntityDefinition("iri", "text", RDFS.label) ; - TextIndex tidx = new TextIndexLucene(dir, eDef, null) ; + TextIndex tidx = new TextIndexLucene(dir, eDef, null, null) ; Dataset ds = TextDatasetFactory.create(ds1, tidx) ; return ds ; } From abdc602fe505167562b7ce9218433bf7c99f2f9e Mon Sep 17 00:00:00 2001 From: Alexis Miara Date: Tue, 21 Apr 2015 14:19:32 -0400 Subject: [PATCH 3/9] change on pom.xml to have local groupId --- jena-text/pom.xml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/jena-text/pom.xml b/jena-text/pom.xml index b33ca6fdfc1..f3d01fbb926 100644 --- a/jena-text/pom.xml +++ b/jena-text/pom.xml @@ -18,6 +18,7 @@ 4.0.0 + licef jena-text jar Apache Jena - SPARQL Text Search @@ -139,6 +140,7 @@ **/TS_*.java + true From a88b6e47a8ab0d595a1a7077f46fd8396ae3e89d Mon Sep 17 00:00:00 2001 From: Alexis Miara Date: Wed, 22 Apr 2015 14:55:58 -0400 Subject: [PATCH 4/9] Introducing Lucene multilingual index --- jena-text/pom.xml | 1 - .../main/java/examples/JenaTextExample1.java | 2 +- .../apache/jena/query/text/LuceneUtil.java | 93 ++++++++++++ .../jena/query/text/TextDatasetFactory.java | 101 ++++++++++++- 
.../query/text/TextDocProducerTriples.java | 7 +- .../jena/query/text/TextIndexLucene.java | 14 +- .../text/TextIndexLuceneMultiLingual.java | 138 ++++++++++++++++++ .../apache/jena/query/text/TextQueryPF.java | 23 +++ .../assembler/TextIndexLuceneAssembler.java | 2 +- ...ctTestDatasetWithLuceneGraphTextIndex.java | 2 +- .../jena/query/text/TestBuildTextDataset.java | 2 +- .../text/TestLuceneWithMultipleThreads.java | 6 +- .../apache/jena/query/text/TestTextTDB.java | 2 +- 13 files changed, 370 insertions(+), 23 deletions(-) create mode 100644 jena-text/src/main/java/org/apache/jena/query/text/LuceneUtil.java create mode 100644 jena-text/src/main/java/org/apache/jena/query/text/TextIndexLuceneMultiLingual.java diff --git a/jena-text/pom.xml b/jena-text/pom.xml index f3d01fbb926..3eb2d347ec5 100644 --- a/jena-text/pom.xml +++ b/jena-text/pom.xml @@ -140,7 +140,6 @@ **/TS_*.java - true diff --git a/jena-text/src/main/java/examples/JenaTextExample1.java b/jena-text/src/main/java/examples/JenaTextExample1.java index 631a0963e9e..6b26e390a86 100644 --- a/jena-text/src/main/java/examples/JenaTextExample1.java +++ b/jena-text/src/main/java/examples/JenaTextExample1.java @@ -65,7 +65,7 @@ public static Dataset createCode() Directory dir = new RAMDirectory(); // Join together into a dataset - Dataset ds = TextDatasetFactory.createLucene(ds1, dir, entDef, null) ; + Dataset ds = TextDatasetFactory.createLucene(ds1, dir, entDef, null, null) ; return ds ; } diff --git a/jena-text/src/main/java/org/apache/jena/query/text/LuceneUtil.java b/jena-text/src/main/java/org/apache/jena/query/text/LuceneUtil.java new file mode 100644 index 00000000000..a4c6145660b --- /dev/null +++ b/jena-text/src/main/java/org/apache/jena/query/text/LuceneUtil.java @@ -0,0 +1,93 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. 
The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.jena.query.text; + +import org.apache.lucene.analysis.Analyzer; +import org.apache.lucene.util.Version; +import java.lang.reflect.Constructor; +import java.util.Hashtable; + +public class LuceneUtil { + + private static Hashtable analyzers; //mapping between ISO2-letter language and lucene existing analyzers + + static { + initAnalyzerDefs(); + } + + public static Analyzer createAnalyzer(String lang, Version ver) { + lang = getISO2Language(lang); + if (lang == null) + return null; + + try { + Class className = analyzers.get(lang); + if (className == null) + return null; + Constructor constructor = className.getConstructor(Version.class); + return (Analyzer)constructor.newInstance(ver); + } catch (Exception e) { + e.printStackTrace(); + return null; + } + } + + public static String getISO2Language(String lang) { + if (lang == null) + return null; + else + return lang.split("-")[0].toLowerCase(); + } + + private static void initAnalyzerDefs() { + analyzers = new Hashtable<>(); + analyzers.put("ar", org.apache.lucene.analysis.ar.ArabicAnalyzer.class); + analyzers.put("bg", org.apache.lucene.analysis.bg.BulgarianAnalyzer.class); + analyzers.put("ca", org.apache.lucene.analysis.ca.CatalanAnalyzer.class); + analyzers.put("cs", org.apache.lucene.analysis.cz.CzechAnalyzer.class); + analyzers.put("da", org.apache.lucene.analysis.da.DanishAnalyzer.class); + 
analyzers.put("de", org.apache.lucene.analysis.de.GermanAnalyzer.class); + analyzers.put("el", org.apache.lucene.analysis.el.GreekAnalyzer.class); + analyzers.put("en", org.apache.lucene.analysis.en.EnglishAnalyzer.class); + analyzers.put("es", org.apache.lucene.analysis.es.SpanishAnalyzer.class); + analyzers.put("eu", org.apache.lucene.analysis.eu.BasqueAnalyzer.class); + analyzers.put("fa", org.apache.lucene.analysis.fa.PersianAnalyzer.class); + analyzers.put("fi", org.apache.lucene.analysis.fi.FinnishAnalyzer.class); + analyzers.put("fr", org.apache.lucene.analysis.fr.FrenchAnalyzer.class); + analyzers.put("ga", org.apache.lucene.analysis.ga.IrishAnalyzer.class); + analyzers.put("gl", org.apache.lucene.analysis.gl.GalicianAnalyzer.class); + analyzers.put("hi", org.apache.lucene.analysis.hi.HindiAnalyzer.class); + analyzers.put("hu", org.apache.lucene.analysis.hu.HungarianAnalyzer.class); + analyzers.put("hy", org.apache.lucene.analysis.hy.ArmenianAnalyzer.class); + analyzers.put("id", org.apache.lucene.analysis.id.IndonesianAnalyzer.class); + analyzers.put("it", org.apache.lucene.analysis.it.ItalianAnalyzer.class); + analyzers.put("ja", org.apache.lucene.analysis.cjk.CJKAnalyzer.class); + analyzers.put("ko", org.apache.lucene.analysis.cjk.CJKAnalyzer.class); + analyzers.put("lv", org.apache.lucene.analysis.lv.LatvianAnalyzer.class); + analyzers.put("nl", org.apache.lucene.analysis.nl.DutchAnalyzer.class); + analyzers.put("no", org.apache.lucene.analysis.no.NorwegianAnalyzer.class); + analyzers.put("pt", org.apache.lucene.analysis.pt.PortugueseAnalyzer.class); + analyzers.put("ro", org.apache.lucene.analysis.ro.RomanianAnalyzer.class); + analyzers.put("ru", org.apache.lucene.analysis.ru.RussianAnalyzer.class); + analyzers.put("sv", org.apache.lucene.analysis.sv.SwedishAnalyzer.class); + analyzers.put("th", org.apache.lucene.analysis.th.ThaiAnalyzer.class); + analyzers.put("tr", org.apache.lucene.analysis.tr.TurkishAnalyzer.class); + analyzers.put("zh", 
org.apache.lucene.analysis.cjk.CJKAnalyzer.class); + } +} diff --git a/jena-text/src/main/java/org/apache/jena/query/text/TextDatasetFactory.java b/jena-text/src/main/java/org/apache/jena/query/text/TextDatasetFactory.java index c7e42082346..45aac330a50 100644 --- a/jena-text/src/main/java/org/apache/jena/query/text/TextDatasetFactory.java +++ b/jena-text/src/main/java/org/apache/jena/query/text/TextDatasetFactory.java @@ -29,6 +29,8 @@ import com.hp.hpl.jena.sparql.core.assembler.AssemblerUtils ; import com.hp.hpl.jena.sparql.util.Context ; +import java.io.File; + public class TextDatasetFactory { static { TextQuery.init(); } @@ -90,43 +92,128 @@ public static DatasetGraph create(DatasetGraph dsg, TextIndex textIndex, boolean * * @param directory The Lucene Directory for the index * @param def The EntityDefinition that defines how entities are stored in the index + * @param analyzer The analyzer to be used to index literals. If null, then the standard analyzer will be used. * @param queryAnalyzer The analyzer to be used to find terms in the query text. If null, then the analyzer defined by the EntityDefinition will be used. */ - public static TextIndex createLuceneIndex(Directory directory, EntityDefinition def, Analyzer queryAnalyzer) + public static TextIndex createLuceneIndex(Directory directory, EntityDefinition def, Analyzer analyzer, Analyzer queryAnalyzer) { - TextIndex index = new TextIndexLucene(directory, def, queryAnalyzer) ; + TextIndex index = new TextIndexLucene(directory, def, analyzer, queryAnalyzer) ; return index ; } + /** + * Create a localized Lucene TextIndex + * + * @param directory The Lucene Directory for the index + * @param def The EntityDefinition that defines how entities are stored in the index + * @param lang The language related with the analyzer. + * @param queryAnalyzer The analyzer to be used to find terms in the query text. If null, then the analyzer defined by the EntityDefinition will be used. 
+ */ + public static TextIndex createLuceneIndexFromLanguage(Directory directory, EntityDefinition def, String lang, Analyzer queryAnalyzer) + { + return createLuceneIndex(directory, def, LuceneUtil.createAnalyzer(lang, TextIndexLucene.VER), queryAnalyzer); + } + + /** + * Create a multilingual Lucene TextIndex + * + * @param directory The Lucene Directory for the index + * @param def The EntityDefinition that defines how entities are stored in the index + */ + public static TextIndex createLuceneIndexMultiLingual(File directory, EntityDefinition def) + { + TextIndex index = new TextIndexLuceneMultiLingual(directory, def) ; + return index ; + } + /** * Create a text-indexed dataset, using Lucene * * @param base the base Dataset * @param directory The Lucene Directory for the index * @param def The EntityDefinition that defines how entities are stored in the index + * @param analyzer The analyzer to be used to index literals. If null, then the standard analyzer will be used. * @param queryAnalyzer The analyzer to be used to find terms in the query text. If null, then the analyzer defined by the EntityDefinition will be used. */ - public static Dataset createLucene(Dataset base, Directory directory, EntityDefinition def, Analyzer queryAnalyzer) + public static Dataset createLucene(Dataset base, Directory directory, EntityDefinition def, Analyzer analyzer, Analyzer queryAnalyzer) { - TextIndex index = createLuceneIndex(directory, def, queryAnalyzer) ; + TextIndex index = createLuceneIndex(directory, def, analyzer, queryAnalyzer) ; return create(base, index, true) ; } + /** + * Create a localized text-indexed dataset, using Lucene + * + * @param base the base Dataset + * @param directory The Lucene Directory for the index + * @param def The EntityDefinition that defines how entities are stored in the index + * @param lang The language related with the analyzer. + * @param queryAnalyzer The analyzer to be used to find terms in the query text. 
If null, then the analyzer defined by the EntityDefinition will be used. + */ + public static Dataset createLuceneFromLanguage(Dataset base, Directory directory, EntityDefinition def, String lang, Analyzer queryAnalyzer) + { + TextIndex index = createLuceneIndexFromLanguage(directory, def, lang, queryAnalyzer) ; + return create(base, index, true) ; + } + + /** + * Create a multilingual text-indexed dataset, using Lucene + * + * @param base the base Dataset + * @param directory The Lucene Directory for the index + * @param def The EntityDefinition that defines how entities are stored in the index + */ + public static Dataset createLuceneMultilingual(Dataset base, File directory, EntityDefinition def) + { + TextIndex index = createLuceneIndexMultiLingual(directory, def) ; + return create(base, index, true) ; + } + /** * Create a text-indexed dataset, using Lucene * * @param base the base DatasetGraph * @param directory The Lucene Directory for the index * @param def The EntityDefinition that defines how entities are stored in the index + * @param analyzer The analyzer to be used to index literals. If null, then the standard analyzer will be used. * @param queryAnalyzer The analyzer to be used to find terms in the query text. If null, then the analyzer defined by the EntityDefinition will be used. 
*/ - public static DatasetGraph createLucene(DatasetGraph base, Directory directory, EntityDefinition def, Analyzer queryAnalyzer) + public static DatasetGraph createLucene(DatasetGraph base, Directory directory, EntityDefinition def, Analyzer analyzer, Analyzer queryAnalyzer) { - TextIndex index = createLuceneIndex(directory, def, queryAnalyzer) ; + TextIndex index = createLuceneIndex(directory, def, analyzer, queryAnalyzer) ; return create(base, index, true) ; } - /** Create a Solr TextIndex */ + /** + * Create a localized text-indexed dataset, using Lucene + * + * @param base the base DatasetGraph + * @param directory The Lucene Directory for the index + * @param def The EntityDefinition that defines how entities are stored in the index + * @param lang The language related with the analyzer. + * @param queryAnalyzer The analyzer to be used to find terms in the query text. If null, then the analyzer defined by the EntityDefinition will be used. + */ + public static DatasetGraph createLuceneFromLanguage(DatasetGraph base, Directory directory, EntityDefinition def, String lang, Analyzer queryAnalyzer) + { + TextIndex index = createLuceneIndexFromLanguage(directory, def, lang, queryAnalyzer) ; + return create(base, index, true) ; + } + + /** + * Create a multilingual text-indexed dataset, using Lucene + * + * @param base the base DatasetGraph + * @param directory The Lucene Directory for the index + * @param def The EntityDefinition that defines how entities are stored in the index + */ + public static DatasetGraph createLuceneMultilingual(DatasetGraph base, File directory, EntityDefinition def) + { + TextIndex index = createLuceneIndexMultiLingual(directory, def) ; + return create(base, index, true) ; + } + + + /** Create a Solr TextIndex */ public static TextIndex createSolrIndex(SolrServer server, EntityDefinition entMap) { TextIndex index = new TextIndexSolr(server, entMap) ; diff --git 
a/jena-text/src/main/java/org/apache/jena/query/text/TextDocProducerTriples.java b/jena-text/src/main/java/org/apache/jena/query/text/TextDocProducerTriples.java index b295148d961..8346261ffe0 100644 --- a/jena-text/src/main/java/org/apache/jena/query/text/TextDocProducerTriples.java +++ b/jena-text/src/main/java/org/apache/jena/query/text/TextDocProducerTriples.java @@ -63,7 +63,12 @@ public void change(QuadAction qaction, Node g, Node s, Node p, Node o) { Entity entity = TextQueryFuncs.entityFromQuad(defn, g, s, p, o) ; // Null means does not match defn if ( entity != null ) { - indexer.addEntity(entity) ; + if (indexer instanceof TextIndexLuceneMultiLingual) { + String lang = o.getLiteral().language(); + ((TextIndexLuceneMultiLingual)indexer).addEntity(entity, lang); + } + else + indexer.addEntity(entity) ; // Auto commit the entity if we aren't in a transaction if (!inTransaction.get()) { diff --git a/jena-text/src/main/java/org/apache/jena/query/text/TextIndexLucene.java b/jena-text/src/main/java/org/apache/jena/query/text/TextIndexLucene.java index caebea18b74..77f09908396 100644 --- a/jena-text/src/main/java/org/apache/jena/query/text/TextIndexLucene.java +++ b/jena-text/src/main/java/org/apache/jena/query/text/TextIndexLucene.java @@ -89,9 +89,10 @@ public class TextIndexLucene implements TextIndex { * * @param directory The Lucene Directory for the index * @param def The EntityDefinition that defines how entities are stored in the index + * @param analyzer The analyzer to be used to index literals. If null, then the standard analyzer will be used. * @param queryAnalyzer The analyzer to be used to find terms in the query text. If null, then the analyzer defined by the EntityDefinition will be used. 
*/ - public TextIndexLucene(Directory directory, EntityDefinition def, Analyzer queryAnalyzer) { + public TextIndexLucene(Directory directory, EntityDefinition def, Analyzer analyzer, Analyzer queryAnalyzer) { this.directory = directory ; this.docDef = def ; @@ -103,14 +104,15 @@ public TextIndexLucene(Directory directory, EntityDefinition def, Analyzer query analyzerPerField.put(def.getGraphField(), new KeywordAnalyzer()) ; for (String field : def.fields()) { - Analyzer analyzer = def.getAnalyzer(field); - if (analyzer != null) { - analyzerPerField.put(field, analyzer); + Analyzer _analyzer = def.getAnalyzer(field); + if (_analyzer != null) { + analyzerPerField.put(field, _analyzer); } } - this.analyzer = new PerFieldAnalyzerWrapper(new StandardAnalyzer(VER), analyzerPerField) ; - this.queryAnalyzer = (null != queryAnalyzer) ? queryAnalyzer : analyzer ; + this.analyzer = new PerFieldAnalyzerWrapper( + (null != analyzer) ? analyzer : new StandardAnalyzer(VER), analyzerPerField) ; + this.queryAnalyzer = (null != queryAnalyzer) ? queryAnalyzer : this.analyzer ; openIndexWriter(); } diff --git a/jena-text/src/main/java/org/apache/jena/query/text/TextIndexLuceneMultiLingual.java b/jena-text/src/main/java/org/apache/jena/query/text/TextIndexLuceneMultiLingual.java new file mode 100644 index 00000000000..0e9608693d8 --- /dev/null +++ b/jena-text/src/main/java/org/apache/jena/query/text/TextIndexLuceneMultiLingual.java @@ -0,0 +1,138 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.jena.query.text; + +import com.hp.hpl.jena.graph.Node; +import org.apache.lucene.analysis.Analyzer; +import org.apache.lucene.store.Directory; +import org.apache.lucene.store.FSDirectory; + +import java.io.File; +import java.io.IOException; +import java.util.*; + +public class TextIndexLuceneMultiLingual implements TextIndex { + + Hashtable indexes; + File indexDir; + private final EntityDefinition docDef; + + public TextIndexLuceneMultiLingual(File directory, EntityDefinition def) { + docDef = def; + indexes = new Hashtable<>(); + + try { + //default index created first. Localized index will be created on the fly. 
+ indexDir = directory; + Directory dir = FSDirectory.open(indexDir); + TextIndex index = new TextIndexLucene(dir, def, null, null); + indexes.put("default", index); + + } catch (IOException e) { + e.printStackTrace(); + } + } + + public Collection getIndexes() { + return indexes.values(); + } + + TextIndex getIndex(String lang) { + lang = LuceneUtil.getISO2Language(lang); + if (lang == null) + lang = "default"; + + if (!indexes.containsKey(lang)) { + //dynamic creation of localized index + try { + Analyzer analyzer = LuceneUtil.createAnalyzer(lang, TextIndexLucene.VER); + if (analyzer != null) { + File indexDirLang = new File(indexDir, lang); + Directory dir = FSDirectory.open(indexDirLang); + TextIndex index = new TextIndexLucene(dir, docDef, analyzer, null); + indexes.put(lang, index); + } + else + lang = "default"; + } catch (IOException e) { + e.printStackTrace(); + } + } + + return indexes.get(lang); + } + + @Override + public void prepareCommit() { + for (TextIndex index : indexes.values()) + index.prepareCommit(); + } + + @Override + public void commit() { + for (TextIndex index : indexes.values()) + index.commit(); + } + + @Override + public void rollback() { + for (TextIndex index : indexes.values()) + index.rollback(); + } + + @Override + public void addEntity(Entity entity) { + } + + public void addEntity(Entity entity, String lang) { + getIndex(lang).addEntity(entity); + } + + @Override + public void updateEntity(Entity entity) { + + } + + @Override + public Map get(String uri) { + return null; + } + + @Override + public List query(String qs, int limit) { + return null; + } + + @Override + public List query(String qs) { + return null; + } + + @Override + public EntityDefinition getDocDef() { + return docDef; + } + + @Override + public void close() { + for (TextIndex index : indexes.values()) + index.close(); + } + +} diff --git a/jena-text/src/main/java/org/apache/jena/query/text/TextQueryPF.java 
b/jena-text/src/main/java/org/apache/jena/query/text/TextQueryPF.java index a17308b4a63..f2658e2c7a8 100644 --- a/jena-text/src/main/java/org/apache/jena/query/text/TextQueryPF.java +++ b/jena-text/src/main/java/org/apache/jena/query/text/TextQueryPF.java @@ -18,6 +18,7 @@ package org.apache.jena.query.text ; +import java.util.Iterator; import java.util.List ; import org.apache.jena.atlas.iterator.Iter ; @@ -77,6 +78,16 @@ public void build(PropFuncArg argSubject, Node predicate, PropFuncArg argObject, if (list.size() > 4) throw new QueryBuildException("Too many arguments in list : " + list) ; } + + // If retrieved index is an instance of TextIndexLuceneMultiLingual, we need to switch with the right index. + // The pattern is : + // ?uri text:query (property 'string' ['lang:language']) + // ex : ?uri text:query (rdfs:label 'livre' 'lang:fr') + // note: default index is the unlocalized index (if lang arg is not present). + if (server instanceof TextIndexLuceneMultiLingual) { + String lang = getArg("lang", argObject); + server = ((TextIndexLuceneMultiLingual)server).getIndex(lang); + } } private static TextIndex chooseTextIndex(DatasetGraph dsg) { @@ -101,6 +112,18 @@ private static TextIndex chooseTextIndex(DatasetGraph dsg) { return null ; } + private String getArg(String prefix, PropFuncArg argObject) { + for (Iterator it = argObject.getArgList().iterator(); it.hasNext(); ) { + Node node = (Node)it.next(); + if (node.isLiteral()) { + String arg = node.getLiteral().toString(); + if (arg.startsWith(prefix + ":")) + return arg.split(":")[1]; + } + } + return null; + } + @Override public QueryIterator exec(Binding binding, PropFuncArg argSubject, Node predicate, PropFuncArg argObject, ExecutionContext execCxt) { diff --git a/jena-text/src/main/java/org/apache/jena/query/text/assembler/TextIndexLuceneAssembler.java b/jena-text/src/main/java/org/apache/jena/query/text/assembler/TextIndexLuceneAssembler.java index 670c530751f..75a27bb61af 100644 --- 
a/jena-text/src/main/java/org/apache/jena/query/text/assembler/TextIndexLuceneAssembler.java +++ b/jena-text/src/main/java/org/apache/jena/query/text/assembler/TextIndexLuceneAssembler.java @@ -93,7 +93,7 @@ public TextIndex open(Assembler a, Resource root, Mode mode) { Resource r = GraphUtils.getResourceValue(root, pEntityMap) ; EntityDefinition docDef = (EntityDefinition)a.open(r) ; - return TextDatasetFactory.createLuceneIndex(directory, docDef, queryAnalyzer) ; + return TextDatasetFactory.createLuceneIndex(directory, docDef, null, queryAnalyzer) ; } catch (IOException e) { IO.exception(e) ; return null ; diff --git a/jena-text/src/test/java/org/apache/jena/query/text/AbstractTestDatasetWithLuceneGraphTextIndex.java b/jena-text/src/test/java/org/apache/jena/query/text/AbstractTestDatasetWithLuceneGraphTextIndex.java index 1011ba4ed67..417fe1cfc9d 100644 --- a/jena-text/src/test/java/org/apache/jena/query/text/AbstractTestDatasetWithLuceneGraphTextIndex.java +++ b/jena-text/src/test/java/org/apache/jena/query/text/AbstractTestDatasetWithLuceneGraphTextIndex.java @@ -39,7 +39,7 @@ public void init() { Directory dir = new RAMDirectory() ; EntityDefinition eDef = new EntityDefinition("iri", "text", "graph", RDFS.label.asNode()) ; eDef.set("comment", RDFS.comment.asNode()) ; // some tests require indexing rdfs:comment - TextIndex tidx = new TextIndexLucene(dir, eDef, null) ; + TextIndex tidx = new TextIndexLucene(dir, eDef, null, null) ; dataset = TextDatasetFactory.create(ds1, tidx) ; } diff --git a/jena-text/src/test/java/org/apache/jena/query/text/TestBuildTextDataset.java b/jena-text/src/test/java/org/apache/jena/query/text/TestBuildTextDataset.java index e045477de5f..6a22802ac5b 100644 --- a/jena-text/src/test/java/org/apache/jena/query/text/TestBuildTextDataset.java +++ b/jena-text/src/test/java/org/apache/jena/query/text/TestBuildTextDataset.java @@ -117,7 +117,7 @@ public static Dataset createCode() { Directory dir = new RAMDirectory() ; // Join together into 
a dataset - Dataset ds = TextDatasetFactory.createLucene(ds1, dir, entDef, null) ; + Dataset ds = TextDatasetFactory.createLucene(ds1, dir, entDef, null, null) ; return ds ; } diff --git a/jena-text/src/test/java/org/apache/jena/query/text/TestLuceneWithMultipleThreads.java b/jena-text/src/test/java/org/apache/jena/query/text/TestLuceneWithMultipleThreads.java index b14526ee315..ffc746b1a25 100644 --- a/jena-text/src/test/java/org/apache/jena/query/text/TestLuceneWithMultipleThreads.java +++ b/jena-text/src/test/java/org/apache/jena/query/text/TestLuceneWithMultipleThreads.java @@ -63,7 +63,7 @@ public class TestLuceneWithMultipleThreads @Test public void testReadInMiddleOfWrite() throws InterruptedException, ExecutionException { - final DatasetGraphText dsg = (DatasetGraphText)TextDatasetFactory.createLucene(new GraphStoreNullTransactional(), new RAMDirectory(), entDef, null); + final DatasetGraphText dsg = (DatasetGraphText)TextDatasetFactory.createLucene(new GraphStoreNullTransactional(), new RAMDirectory(), entDef, null, null); final Dataset ds = DatasetFactory.create(dsg); final ExecutorService execService = Executors.newSingleThreadExecutor(); final Future f = execService.submit(new Runnable() @@ -119,7 +119,7 @@ public void run() @Test public void testWriteInMiddleOfRead() throws InterruptedException, ExecutionException { - final DatasetGraphText dsg = (DatasetGraphText)TextDatasetFactory.createLucene(new GraphStoreNullTransactional(), new RAMDirectory(), entDef, null); + final DatasetGraphText dsg = (DatasetGraphText)TextDatasetFactory.createLucene(new GraphStoreNullTransactional(), new RAMDirectory(), entDef, null, null); final int numReads = 10; final Dataset ds = DatasetFactory.create(dsg); final ExecutorService execService = Executors.newFixedThreadPool(10); @@ -187,7 +187,7 @@ public void run() @Test public void testIsolation() throws InterruptedException, ExecutionException { - final DatasetGraphText dsg = 
(DatasetGraphText)TextDatasetFactory.createLucene(DatasetGraphFactory.createMem(), new RAMDirectory(), entDef, null); + final DatasetGraphText dsg = (DatasetGraphText)TextDatasetFactory.createLucene(DatasetGraphFactory.createMem(), new RAMDirectory(), entDef, null, null); final int numReaders = 2; final List> futures = new ArrayList>(numReaders); diff --git a/jena-text/src/test/java/org/apache/jena/query/text/TestTextTDB.java b/jena-text/src/test/java/org/apache/jena/query/text/TestTextTDB.java index a80d399c06e..cb592714d27 100644 --- a/jena-text/src/test/java/org/apache/jena/query/text/TestTextTDB.java +++ b/jena-text/src/test/java/org/apache/jena/query/text/TestTextTDB.java @@ -40,7 +40,7 @@ private static Dataset create() { Dataset ds1 = TDBFactory.createDataset() ; Directory dir = new RAMDirectory() ; EntityDefinition eDef = new EntityDefinition("iri", "text", RDFS.label) ; - TextIndex tidx = new TextIndexLucene(dir, eDef, null) ; + TextIndex tidx = new TextIndexLucene(dir, eDef, null, null) ; Dataset ds = TextDatasetFactory.create(ds1, tidx) ; return ds ; } From a125642e1f6bd8e9ec732784d897df6c4e7cd28c Mon Sep 17 00:00:00 2001 From: Alexis Miara Date: Wed, 22 Apr 2015 15:44:31 -0400 Subject: [PATCH 5/9] original pom.xml --- jena-text/pom.xml | 1 - 1 file changed, 1 deletion(-) diff --git a/jena-text/pom.xml b/jena-text/pom.xml index 3eb2d347ec5..b33ca6fdfc1 100644 --- a/jena-text/pom.xml +++ b/jena-text/pom.xml @@ -18,7 +18,6 @@ 4.0.0 - licef jena-text jar Apache Jena - SPARQL Text Search From afd38892930d4145ce35a8b5f3ed2aabf20cf21e Mon Sep 17 00:00:00 2001 From: Alexis Miara Date: Tue, 28 Apr 2015 12:01:38 -0400 Subject: [PATCH 6/9] initial pom.xml --- jena-text/pom.xml | 1 - 1 file changed, 1 deletion(-) diff --git a/jena-text/pom.xml b/jena-text/pom.xml index c3d96c06ca7..af5b671a640 100644 --- a/jena-text/pom.xml +++ b/jena-text/pom.xml @@ -18,7 +18,6 @@ 4.0.0 - licef jena-text jar Apache Jena - SPARQL Text Search From 
14b10318576f67a318cdc643dbbca15197b7bba9 Mon Sep 17 00:00:00 2001 From: Alexis Miara Date: Wed, 29 Apr 2015 11:54:07 -0400 Subject: [PATCH 7/9] Revert to previous method signatures (for Lucene index creation) + better implementation of addEntity inside TextIndexLuceneMultilingual --- .../main/java/examples/JenaTextExample1.java | 2 +- .../org/apache/jena/query/text/Entity.java | 9 +++- .../jena/query/text/TextDatasetFactory.java | 45 ++++++++++++++++++- .../query/text/TextDocProducerTriples.java | 7 +-- .../jena/query/text/TextIndexLucene.java | 11 +++++ .../text/TextIndexLuceneMultiLingual.java | 7 ++- .../jena/query/text/TextQueryFuncs.java | 9 +++- .../assembler/TextIndexLuceneAssembler.java | 2 +- ...ctTestDatasetWithLuceneGraphTextIndex.java | 2 +- .../jena/query/text/TestBuildTextDataset.java | 2 +- .../text/TestLuceneWithMultipleThreads.java | 6 +-- .../apache/jena/query/text/TestTextTDB.java | 2 +- 12 files changed, 82 insertions(+), 22 deletions(-) diff --git a/jena-text/src/main/java/examples/JenaTextExample1.java b/jena-text/src/main/java/examples/JenaTextExample1.java index ab9a4b36e94..6ad2c26e328 100644 --- a/jena-text/src/main/java/examples/JenaTextExample1.java +++ b/jena-text/src/main/java/examples/JenaTextExample1.java @@ -64,7 +64,7 @@ public static Dataset createCode() Directory dir = new RAMDirectory(); // Join together into a dataset - Dataset ds = TextDatasetFactory.createLucene(ds1, dir, entDef, null, null) ; + Dataset ds = TextDatasetFactory.createLucene(ds1, dir, entDef, null) ; return ds ; } diff --git a/jena-text/src/main/java/org/apache/jena/query/text/Entity.java b/jena-text/src/main/java/org/apache/jena/query/text/Entity.java index d770c5aa9db..c7757a31b79 100644 --- a/jena-text/src/main/java/org/apache/jena/query/text/Entity.java +++ b/jena-text/src/main/java/org/apache/jena/query/text/Entity.java @@ -25,13 +25,18 @@ public class Entity { private final String id ; private final String graph ; + private final String language ; private 
final Map map = new HashMap<>() ; public Entity(String entityId, String entityGraph) { + this(entityId, entityGraph, null); + } + + public Entity(String entityId, String entityGraph, String lang) { this.id = entityId ; this.graph = entityGraph; + this.language = lang; } - /** @deprecated Use {@linkplain #Entity(String, String)} */ @Deprecated public Entity(String entityId) { this(entityId, null) ; } @@ -40,6 +45,8 @@ public Entity(String entityId, String entityGraph) { public String getGraph() { return graph ; } + public String getLanguage() { return language ; } + public void put(String key, Object value) { map.put(key, value) ; } diff --git a/jena-text/src/main/java/org/apache/jena/query/text/TextDatasetFactory.java b/jena-text/src/main/java/org/apache/jena/query/text/TextDatasetFactory.java index 41e70f53a07..0810d9d431c 100644 --- a/jena-text/src/main/java/org/apache/jena/query/text/TextDatasetFactory.java +++ b/jena-text/src/main/java/org/apache/jena/query/text/TextDatasetFactory.java @@ -85,7 +85,20 @@ public static DatasetGraph create(DatasetGraph dsg, TextIndex textIndex, boolean return dsgt ; } - + + /** + * Create a Lucene TextIndex + * + * @param directory The Lucene Directory for the index + * @param def The EntityDefinition that defines how entities are stored in the index + * @param queryAnalyzer The analyzer to be used to find terms in the query text. If null, then the analyzer defined by the EntityDefinition will be used. 
+ */ + public static TextIndex createLuceneIndex(Directory directory, EntityDefinition def, Analyzer queryAnalyzer) + { + TextIndex index = new TextIndexLucene(directory, def, queryAnalyzer) ; + return index ; + } + /** * Create a Lucene TextIndex * @@ -125,7 +138,21 @@ public static TextIndex createLuceneIndexMultiLingual(File directory, EntityDefi return index ; } - /** + /** + * Create a text-indexed dataset, using Lucene + * + * @param base the base Dataset + * @param directory The Lucene Directory for the index + * @param def The EntityDefinition that defines how entities are stored in the index + * @param queryAnalyzer The analyzer to be used to find terms in the query text. If null, then the analyzer defined by the EntityDefinition will be used. + */ + public static Dataset createLucene(Dataset base, Directory directory, EntityDefinition def, Analyzer queryAnalyzer) + { + TextIndex index = createLuceneIndex(directory, def, queryAnalyzer) ; + return create(base, index, true) ; + } + + /** * Create a text-indexed dataset, using Lucene * * @param base the base Dataset @@ -168,6 +195,20 @@ public static Dataset createLuceneMultilingual(Dataset base, File directory, Ent return create(base, index, true) ; } + /** + * Create a text-indexed dataset, using Lucene + * + * @param base the base DatasetGraph + * @param directory The Lucene Directory for the index + * @param def The EntityDefinition that defines how entities are stored in the index + * @param queryAnalyzer The analyzer to be used to find terms in the query text. If null, then the analyzer defined by the EntityDefinition will be used. 
+ */ + public static DatasetGraph createLucene(DatasetGraph base, Directory directory, EntityDefinition def, Analyzer queryAnalyzer) + { + TextIndex index = createLuceneIndex(directory, def, queryAnalyzer) ; + return create(base, index, true) ; + } + /** * Create a text-indexed dataset, using Lucene * diff --git a/jena-text/src/main/java/org/apache/jena/query/text/TextDocProducerTriples.java b/jena-text/src/main/java/org/apache/jena/query/text/TextDocProducerTriples.java index 9de20098ad8..c0bcabd16b7 100644 --- a/jena-text/src/main/java/org/apache/jena/query/text/TextDocProducerTriples.java +++ b/jena-text/src/main/java/org/apache/jena/query/text/TextDocProducerTriples.java @@ -62,12 +62,7 @@ public void change(QuadAction qaction, Node g, Node s, Node p, Node o) { Entity entity = TextQueryFuncs.entityFromQuad(defn, g, s, p, o) ; // Null means does not match defn if ( entity != null ) { - if (indexer instanceof TextIndexLuceneMultiLingual) { - String lang = o.getLiteral().language(); - ((TextIndexLuceneMultiLingual)indexer).addEntity(entity, lang); - } - else - indexer.addEntity(entity) ; + indexer.addEntity(entity) ; // Auto commit the entity if we aren't in a transaction if (!inTransaction.get()) { diff --git a/jena-text/src/main/java/org/apache/jena/query/text/TextIndexLucene.java b/jena-text/src/main/java/org/apache/jena/query/text/TextIndexLucene.java index 7a8381b03af..f0f5abb4cf4 100644 --- a/jena-text/src/main/java/org/apache/jena/query/text/TextIndexLucene.java +++ b/jena-text/src/main/java/org/apache/jena/query/text/TextIndexLucene.java @@ -83,6 +83,17 @@ public class TextIndexLucene implements TextIndex { // at a time (enforced elsewhere). private volatile IndexWriter indexWriter ; + /** + * Constructs a new TextIndexLucene. + * + * @param directory The Lucene Directory for the index + * @param def The EntityDefinition that defines how entities are stored in the index + * @param queryAnalyzer The analyzer to be used to find terms in the query text. 
If null, then the analyzer defined by the EntityDefinition will be used. + */ + public TextIndexLucene(Directory directory, EntityDefinition def, Analyzer queryAnalyzer) { + this(directory, def, null, queryAnalyzer); + } + /** * Constructs a new TextIndexLucene. * diff --git a/jena-text/src/main/java/org/apache/jena/query/text/TextIndexLuceneMultiLingual.java b/jena-text/src/main/java/org/apache/jena/query/text/TextIndexLuceneMultiLingual.java index 410eb008053..8a2f11b9ed6 100644 --- a/jena-text/src/main/java/org/apache/jena/query/text/TextIndexLuceneMultiLingual.java +++ b/jena-text/src/main/java/org/apache/jena/query/text/TextIndexLuceneMultiLingual.java @@ -98,15 +98,14 @@ public void rollback() { @Override public void addEntity(Entity entity) { - } - - public void addEntity(Entity entity, String lang) { + String lang = entity.getLanguage(); getIndex(lang).addEntity(entity); } @Override public void updateEntity(Entity entity) { - + String lang = entity.getLanguage(); + getIndex(lang).updateEntity(entity); } @Override diff --git a/jena-text/src/main/java/org/apache/jena/query/text/TextQueryFuncs.java b/jena-text/src/main/java/org/apache/jena/query/text/TextQueryFuncs.java index 512297ec1b5..d628c4a3ec1 100644 --- a/jena-text/src/main/java/org/apache/jena/query/text/TextQueryFuncs.java +++ b/jena-text/src/main/java/org/apache/jena/query/text/TextQueryFuncs.java @@ -46,6 +46,12 @@ public static String graphNodeToString(Node g) { return nodeToString(g) ; } + /** retrieve language (if exists) if object is literal */ + public static String getLiteralLanguage(Node o) { + String lang = o.getLiteral().language(); + return lang; + } + private static String nodeToString(Node n) { return (n.isURI() ) ? 
n.getURI() : "_:" + n.getBlankNodeLabel() ; } @@ -77,7 +83,8 @@ public static Entity entityFromQuad(EntityDefinition defn , Node g , Node s , No String x = TextQueryFuncs.subjectToString(s) ; String graphText = TextQueryFuncs.graphNodeToString(g) ; - Entity entity = new Entity(x, graphText) ; + String language = TextQueryFuncs.getLiteralLanguage(o) ; + Entity entity = new Entity(x, graphText, language) ; String graphField = defn.getGraphField() ; if ( defn.getGraphField() != null ) entity.put(graphField, graphText) ; diff --git a/jena-text/src/main/java/org/apache/jena/query/text/assembler/TextIndexLuceneAssembler.java b/jena-text/src/main/java/org/apache/jena/query/text/assembler/TextIndexLuceneAssembler.java index 88b42eba584..361841c0580 100644 --- a/jena-text/src/main/java/org/apache/jena/query/text/assembler/TextIndexLuceneAssembler.java +++ b/jena-text/src/main/java/org/apache/jena/query/text/assembler/TextIndexLuceneAssembler.java @@ -92,7 +92,7 @@ public TextIndex open(Assembler a, Resource root, Mode mode) { Resource r = GraphUtils.getResourceValue(root, pEntityMap) ; EntityDefinition docDef = (EntityDefinition)a.open(r) ; - return TextDatasetFactory.createLuceneIndex(directory, docDef, null, queryAnalyzer) ; + return TextDatasetFactory.createLuceneIndex(directory, docDef, queryAnalyzer) ; } catch (IOException e) { IO.exception(e) ; return null ; diff --git a/jena-text/src/test/java/org/apache/jena/query/text/AbstractTestDatasetWithLuceneGraphTextIndex.java b/jena-text/src/test/java/org/apache/jena/query/text/AbstractTestDatasetWithLuceneGraphTextIndex.java index dc9f5501a95..56a81b6f796 100644 --- a/jena-text/src/test/java/org/apache/jena/query/text/AbstractTestDatasetWithLuceneGraphTextIndex.java +++ b/jena-text/src/test/java/org/apache/jena/query/text/AbstractTestDatasetWithLuceneGraphTextIndex.java @@ -38,7 +38,7 @@ public void init() { Directory dir = new RAMDirectory() ; EntityDefinition eDef = new EntityDefinition("iri", "text", "graph", 
RDFS.label.asNode()) ; eDef.set("comment", RDFS.comment.asNode()) ; // some tests require indexing rdfs:comment - TextIndex tidx = new TextIndexLucene(dir, eDef, null, null) ; + TextIndex tidx = new TextIndexLucene(dir, eDef, null) ; dataset = TextDatasetFactory.create(ds1, tidx) ; } diff --git a/jena-text/src/test/java/org/apache/jena/query/text/TestBuildTextDataset.java b/jena-text/src/test/java/org/apache/jena/query/text/TestBuildTextDataset.java index ec00e149bae..2c3564dac32 100644 --- a/jena-text/src/test/java/org/apache/jena/query/text/TestBuildTextDataset.java +++ b/jena-text/src/test/java/org/apache/jena/query/text/TestBuildTextDataset.java @@ -116,7 +116,7 @@ public static Dataset createCode() { Directory dir = new RAMDirectory() ; // Join together into a dataset - Dataset ds = TextDatasetFactory.createLucene(ds1, dir, entDef, null, null) ; + Dataset ds = TextDatasetFactory.createLucene(ds1, dir, entDef, null) ; return ds ; } diff --git a/jena-text/src/test/java/org/apache/jena/query/text/TestLuceneWithMultipleThreads.java b/jena-text/src/test/java/org/apache/jena/query/text/TestLuceneWithMultipleThreads.java index 0d9381e27e9..dc02671d145 100644 --- a/jena-text/src/test/java/org/apache/jena/query/text/TestLuceneWithMultipleThreads.java +++ b/jena-text/src/test/java/org/apache/jena/query/text/TestLuceneWithMultipleThreads.java @@ -56,7 +56,7 @@ public class TestLuceneWithMultipleThreads @Test public void testReadInMiddleOfWrite() throws InterruptedException, ExecutionException { - final DatasetGraphText dsg = (DatasetGraphText)TextDatasetFactory.createLucene(new GraphStoreNullTransactional(), new RAMDirectory(), entDef, null, null); + final DatasetGraphText dsg = (DatasetGraphText)TextDatasetFactory.createLucene(new GraphStoreNullTransactional(), new RAMDirectory(), entDef, null); final Dataset ds = DatasetFactory.create(dsg); final ExecutorService execService = Executors.newSingleThreadExecutor(); final Future f = execService.submit(new Runnable() @@ 
-112,7 +112,7 @@ public void run() @Test public void testWriteInMiddleOfRead() throws InterruptedException, ExecutionException { - final DatasetGraphText dsg = (DatasetGraphText)TextDatasetFactory.createLucene(new GraphStoreNullTransactional(), new RAMDirectory(), entDef, null, null); + final DatasetGraphText dsg = (DatasetGraphText)TextDatasetFactory.createLucene(new GraphStoreNullTransactional(), new RAMDirectory(), entDef, null); final int numReads = 10; final Dataset ds = DatasetFactory.create(dsg); final ExecutorService execService = Executors.newFixedThreadPool(10); @@ -180,7 +180,7 @@ public void run() @Test public void testIsolation() throws InterruptedException, ExecutionException { - final DatasetGraphText dsg = (DatasetGraphText)TextDatasetFactory.createLucene(DatasetGraphFactory.createMem(), new RAMDirectory(), entDef, null, null); + final DatasetGraphText dsg = (DatasetGraphText)TextDatasetFactory.createLucene(DatasetGraphFactory.createMem(), new RAMDirectory(), entDef, null); final int numReaders = 2; final List> futures = new ArrayList>(numReaders); diff --git a/jena-text/src/test/java/org/apache/jena/query/text/TestTextTDB.java b/jena-text/src/test/java/org/apache/jena/query/text/TestTextTDB.java index dd2acfd48c3..f3307f0e733 100644 --- a/jena-text/src/test/java/org/apache/jena/query/text/TestTextTDB.java +++ b/jena-text/src/test/java/org/apache/jena/query/text/TestTextTDB.java @@ -39,7 +39,7 @@ private static Dataset create() { Dataset ds1 = TDBFactory.createDataset() ; Directory dir = new RAMDirectory() ; EntityDefinition eDef = new EntityDefinition("iri", "text", RDFS.label) ; - TextIndex tidx = new TextIndexLucene(dir, eDef, null, null) ; + TextIndex tidx = new TextIndexLucene(dir, eDef, null) ; Dataset ds = TextDatasetFactory.create(ds1, tidx) ; return ds ; } From b3365dc214c577aa890eb9f1aaf809c6eadba8af Mon Sep 17 00:00:00 2001 From: Alexis Miara Date: Thu, 7 May 2015 14:31:49 -0400 Subject: [PATCH 8/9] creation of 2 new jena-text assemblers 
: - for Lucene multilingual index - for a dynamic localized analyzer --- jena-fuseki1/config-tdb-text.ttl | 1 + .../apache/jena/query/text/LuceneUtil.java | 2 +- .../jena/query/text/TextDatasetFactory.java | 55 +------ ....java => TextIndexLuceneMultilingual.java} | 35 +++-- .../apache/jena/query/text/TextQueryPF.java | 37 +++-- .../assembler/LocalizedAnalyzerAssembler.java | 60 +++++++ .../query/text/assembler/TextAssembler.java | 2 + .../TextIndexLuceneMultilingualAssembler.java | 87 +++++++++++ .../jena/query/text/assembler/TextVocab.java | 3 + .../org/apache/jena/query/text/TS_Text.java | 2 + .../TestDatasetWithLocalizedAnalyzer.java | 147 ++++++++++++++++++ ...atasetWithLuceneMultilingualTextIndex.java | 112 +++++++++++++ 12 files changed, 465 insertions(+), 78 deletions(-) rename jena-text/src/main/java/org/apache/jena/query/text/{TextIndexLuceneMultiLingual.java => TextIndexLuceneMultilingual.java} (75%) create mode 100644 jena-text/src/main/java/org/apache/jena/query/text/assembler/LocalizedAnalyzerAssembler.java create mode 100644 jena-text/src/main/java/org/apache/jena/query/text/assembler/TextIndexLuceneMultilingualAssembler.java create mode 100644 jena-text/src/test/java/org/apache/jena/query/text/TestDatasetWithLocalizedAnalyzer.java create mode 100644 jena-text/src/test/java/org/apache/jena/query/text/TestDatasetWithLuceneMultilingualTextIndex.java diff --git a/jena-fuseki1/config-tdb-text.ttl b/jena-fuseki1/config-tdb-text.ttl index e0ff09dc91e..c4567f5be83 100644 --- a/jena-fuseki1/config-tdb-text.ttl +++ b/jena-fuseki1/config-tdb-text.ttl @@ -46,6 +46,7 @@ tdb:GraphTDB rdfs:subClassOf ja:Model . text:TextDataset rdfs:subClassOf ja:RDFDataset . #text:TextIndexSolr rdfs:subClassOf text:TextIndex . text:TextIndexLucene rdfs:subClassOf text:TextIndex . +#text:TextIndexLuceneMultilingual rdfs:subClassOf text:TextIndex . 
## --------------------------------------------------------------- diff --git a/jena-text/src/main/java/org/apache/jena/query/text/LuceneUtil.java b/jena-text/src/main/java/org/apache/jena/query/text/LuceneUtil.java index a4c6145660b..7c41ea9b97e 100644 --- a/jena-text/src/main/java/org/apache/jena/query/text/LuceneUtil.java +++ b/jena-text/src/main/java/org/apache/jena/query/text/LuceneUtil.java @@ -37,7 +37,7 @@ public static Analyzer createAnalyzer(String lang, Version ver) { return null; try { - Class className = analyzers.get(lang); + Class className = analyzers.get(lang); if (className == null) return null; Constructor constructor = className.getConstructor(Version.class); diff --git a/jena-text/src/main/java/org/apache/jena/query/text/TextDatasetFactory.java b/jena-text/src/main/java/org/apache/jena/query/text/TextDatasetFactory.java index 0810d9d431c..6f7b43fc088 100644 --- a/jena-text/src/main/java/org/apache/jena/query/text/TextDatasetFactory.java +++ b/jena-text/src/main/java/org/apache/jena/query/text/TextDatasetFactory.java @@ -113,28 +113,15 @@ public static TextIndex createLuceneIndex(Directory directory, EntityDefinition return index ; } - /** - * Create a localized Lucene TextIndex - * - * @param directory The Lucene Directory for the index - * @param def The EntityDefinition that defines how entities are stored in the index - * @param lang The language related with the analyzer. - * @param queryAnalyzer The analyzer to be used to find terms in the query text. If null, then the analyzer defined by the EntityDefinition will be used. 
- */ - public static TextIndex createLuceneIndexFromLanguage(Directory directory, EntityDefinition def, String lang, Analyzer queryAnalyzer) - { - return createLuceneIndex(directory, def, LuceneUtil.createAnalyzer(lang, TextIndexLucene.VER), queryAnalyzer); - } - /** * Create a multilingual Lucene TextIndex * * @param directory The Lucene Directory for the index * @param def The EntityDefinition that defines how entities are stored in the index */ - public static TextIndex createLuceneIndexMultiLingual(File directory, EntityDefinition def) + public static TextIndex createLuceneIndexMultilingual(Directory directory, EntityDefinition def) { - TextIndex index = new TextIndexLuceneMultiLingual(directory, def) ; + TextIndex index = new TextIndexLuceneMultilingual(directory, def) ; return index ; } @@ -167,21 +154,6 @@ public static Dataset createLucene(Dataset base, Directory directory, EntityDefi return create(base, index, true) ; } - /** - * Create a localized text-indexed dataset, using Lucene - * - * @param base the base Dataset - * @param directory The Lucene Directory for the index - * @param def The EntityDefinition that defines how entities are stored in the index - * @param lang The language related with the analyzer. - * @param queryAnalyzer The analyzer to be used to find terms in the query text. If null, then the analyzer defined by the EntityDefinition will be used. 
- */ - public static Dataset createLuceneFromLanguage(Dataset base, Directory directory, EntityDefinition def, String lang, Analyzer queryAnalyzer) - { - TextIndex index = createLuceneIndexFromLanguage(directory, def, lang, queryAnalyzer) ; - return create(base, index, true) ; - } - /** * Create a multilingual text-indexed dataset, using Lucene * @@ -189,9 +161,9 @@ public static Dataset createLuceneFromLanguage(Dataset base, Directory directory * @param directory The Lucene Directory for the index * @param def The EntityDefinition that defines how entities are stored in the index */ - public static Dataset createLuceneMultilingual(Dataset base, File directory, EntityDefinition def) + public static Dataset createLuceneMultilingual(Dataset base, Directory directory, EntityDefinition def) { - TextIndex index = createLuceneIndexMultiLingual(directory, def) ; + TextIndex index = createLuceneIndexMultilingual(directory, def) ; return create(base, index, true) ; } @@ -224,21 +196,6 @@ public static DatasetGraph createLucene(DatasetGraph base, Directory directory, return create(base, index, true) ; } - /** - * Create a localized text-indexed dataset, using Lucene - * - * @param base the base DatasetGraph - * @param directory The Lucene Directory for the index - * @param def The EntityDefinition that defines how entities are stored in the index - * @param lang The language related with the analyzer. - * @param queryAnalyzer The analyzer to be used to find terms in the query text. If null, then the analyzer defined by the EntityDefinition will be used. 
- */ - public static DatasetGraph createLuceneFromLanguage(DatasetGraph base, Directory directory, EntityDefinition def, String lang, Analyzer queryAnalyzer) - { - TextIndex index = createLuceneIndexFromLanguage(directory, def, lang, queryAnalyzer) ; - return create(base, index, true) ; - } - /** * Create a multilingual text-indexed dataset, using Lucene * @@ -246,9 +203,9 @@ public static DatasetGraph createLuceneFromLanguage(DatasetGraph base, Directory * @param directory The Lucene Directory for the index * @param def The EntityDefinition that defines how entities are stored in the index */ - public static DatasetGraph createLuceneMultilingual(DatasetGraph base, File directory, EntityDefinition def) + public static DatasetGraph createLuceneMultilingual(DatasetGraph base, Directory directory, EntityDefinition def) { - TextIndex index = createLuceneIndexMultiLingual(directory, def) ; + TextIndex index = createLuceneIndexMultilingual(directory, def) ; return create(base, index, true) ; } diff --git a/jena-text/src/main/java/org/apache/jena/query/text/TextIndexLuceneMultiLingual.java b/jena-text/src/main/java/org/apache/jena/query/text/TextIndexLuceneMultilingual.java similarity index 75% rename from jena-text/src/main/java/org/apache/jena/query/text/TextIndexLuceneMultiLingual.java rename to jena-text/src/main/java/org/apache/jena/query/text/TextIndexLuceneMultilingual.java index 8a2f11b9ed6..0af28ff9639 100644 --- a/jena-text/src/main/java/org/apache/jena/query/text/TextIndexLuceneMultiLingual.java +++ b/jena-text/src/main/java/org/apache/jena/query/text/TextIndexLuceneMultilingual.java @@ -22,31 +22,26 @@ import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.store.Directory; import org.apache.lucene.store.FSDirectory; +import org.apache.lucene.store.RAMDirectory; import java.io.File; import java.io.IOException; import java.util.*; -public class TextIndexLuceneMultiLingual implements TextIndex { +public class TextIndexLuceneMultilingual implements 
TextIndex { Hashtable indexes; - File indexDir; private final EntityDefinition docDef; + private final Directory directory ; - public TextIndexLuceneMultiLingual(File directory, EntityDefinition def) { - docDef = def; + public TextIndexLuceneMultilingual(Directory directory, EntityDefinition def) { + this.directory = directory ; + this.docDef = def; indexes = new Hashtable<>(); - try { - //default index created first. Localized index will be created on the fly. - indexDir = directory; - Directory dir = FSDirectory.open(indexDir); - TextIndex index = new TextIndexLucene(dir, def, null, null); - indexes.put("default", index); - - } catch (IOException e) { - e.printStackTrace(); - } + //default index created first. Localized index will be created on the fly. + TextIndex index = new TextIndexLucene(directory, def, null, null); + indexes.put("default", index); } public Collection getIndexes() { @@ -63,9 +58,15 @@ TextIndex getIndex(String lang) { try { Analyzer analyzer = LuceneUtil.createAnalyzer(lang, TextIndexLucene.VER); if (analyzer != null) { - File indexDirLang = new File(indexDir, lang); - Directory dir = FSDirectory.open(indexDirLang); - TextIndex index = new TextIndexLucene(dir, docDef, analyzer, null); + Directory langDir; + if (directory instanceof FSDirectory) { + File dir = ((FSDirectory) directory).getDirectory(); + File indexDirLang = new File(dir, lang); + langDir = FSDirectory.open(indexDirLang); + } + else + langDir = new RAMDirectory(); + TextIndex index = new TextIndexLucene(langDir, docDef, analyzer, null); indexes.put(lang, index); } else diff --git a/jena-text/src/main/java/org/apache/jena/query/text/TextQueryPF.java b/jena-text/src/main/java/org/apache/jena/query/text/TextQueryPF.java index b023c5a4396..83aaff0fff7 100644 --- a/jena-text/src/main/java/org/apache/jena/query/text/TextQueryPF.java +++ b/jena-text/src/main/java/org/apache/jena/query/text/TextQueryPF.java @@ -69,7 +69,15 @@ public void build(PropFuncArg argSubject, Node predicate, 
PropFuncArg argObject, if (!argSubject.isNode()) throw new QueryBuildException("Subject is not a single node: " + argSubject) ; + //extra lang arg for possible multilingual index + String lang = null; + if (argObject.isList()) { + //extract of extra lang arg if present. + //For the moment, arg is removed from the list to avoid conflict with order and args length + //but should be managed with others args + lang = extractArg("lang", argObject); + List list = argObject.getArgList() ; if (list.size() == 0) throw new QueryBuildException("Zero-length argument list") ; @@ -78,15 +86,14 @@ public void build(PropFuncArg argSubject, Node predicate, PropFuncArg argObject, throw new QueryBuildException("Too many arguments in list : " + list) ; } - // If retrieved index is an instance of TextIndexLuceneMultiLingual, we need to switch with the right index. + // If retrieved index is an instance of TextIndexLuceneMultilingual, + // we need to switch with the right localized index. // The pattern is : - // ?uri text:query (property 'string' ['lang:language']) - // ex : ?uri text:query (rdfs:label 'livre' 'lang:fr') + // ?uri text:query (property 'string' ['lang:language'] [limit]) + // ex : ?uri text:query (rdfs:label 'book' 'lang:en') // note: default index is the unlocalized index (if lang arg is not present). 
- if (server instanceof TextIndexLuceneMultiLingual) { - String lang = getArg("lang", argObject); - server = ((TextIndexLuceneMultiLingual)server).getIndex(lang); - } + if (server instanceof TextIndexLuceneMultilingual) + server = ((TextIndexLuceneMultilingual)server).getIndex(lang); } private static TextIndex chooseTextIndex(DatasetGraph dsg) { @@ -111,16 +118,24 @@ private static TextIndex chooseTextIndex(DatasetGraph dsg) { return null ; } - private String getArg(String prefix, PropFuncArg argObject) { + private String extractArg(String prefix, PropFuncArg argObject) { + String value = null; + int pos = 0; for (Iterator it = argObject.getArgList().iterator(); it.hasNext(); ) { Node node = (Node)it.next(); if (node.isLiteral()) { String arg = node.getLiteral().toString(); - if (arg.startsWith(prefix + ":")) - return arg.split(":")[1]; + if (arg.startsWith(prefix + ":")) { + value = arg.split(":")[1]; + break; + } } + pos++; } - return null; + if (value != null) + argObject.getArgList().remove(pos); + + return value; } @Override diff --git a/jena-text/src/main/java/org/apache/jena/query/text/assembler/LocalizedAnalyzerAssembler.java b/jena-text/src/main/java/org/apache/jena/query/text/assembler/LocalizedAnalyzerAssembler.java new file mode 100644 index 00000000000..a72c2a9c27e --- /dev/null +++ b/jena-text/src/main/java/org/apache/jena/query/text/assembler/LocalizedAnalyzerAssembler.java @@ -0,0 +1,60 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.jena.query.text.assembler; + +import org.apache.jena.assembler.Assembler; +import org.apache.jena.assembler.Mode; +import org.apache.jena.assembler.assemblers.AssemblerBase; +import org.apache.jena.query.text.LuceneUtil; +import org.apache.jena.query.text.TextIndexException; +import org.apache.jena.query.text.TextIndexLucene; +import org.apache.jena.rdf.model.RDFNode; +import org.apache.jena.rdf.model.Resource; +import org.apache.lucene.analysis.Analyzer; +import org.apache.lucene.analysis.standard.StandardAnalyzer; + +/** + * Assembler to create localized analyzer. + */ +public class LocalizedAnalyzerAssembler extends AssemblerBase { + /* + text:map ( + [ text:field "text" ; + text:predicate rdfs:label; + text:analyzer [ + a lucene:LocalizedAnalyzer ; + text:language "en" ; + ] + . + */ + + @Override + public Analyzer open(Assembler a, Resource root, Mode mode) { + if (root.hasProperty(TextVocab.pLanguage)) { + RDFNode node = root.getProperty(TextVocab.pLanguage).getObject(); + if (! 
node.isLiteral()) { + throw new TextIndexException("text:language property must be a string : " + node); + } + String lang = node.toString(); + return LuceneUtil.createAnalyzer(lang, TextIndexLucene.VER); + } else { + return new StandardAnalyzer(TextIndexLucene.VER); + } + } +} diff --git a/jena-text/src/main/java/org/apache/jena/query/text/assembler/TextAssembler.java b/jena-text/src/main/java/org/apache/jena/query/text/assembler/TextAssembler.java index d901bc73b98..790dac7cf15 100644 --- a/jena-text/src/main/java/org/apache/jena/query/text/assembler/TextAssembler.java +++ b/jena-text/src/main/java/org/apache/jena/query/text/assembler/TextAssembler.java @@ -30,10 +30,12 @@ public static void init() Assembler.general.implementWith(TextVocab.entityMap, new EntityDefinitionAssembler()) ; Assembler.general.implementWith(TextVocab.textIndexSolr, new TextIndexSolrAssembler()) ; Assembler.general.implementWith(TextVocab.textIndexLucene, new TextIndexLuceneAssembler()) ; + Assembler.general.implementWith(TextVocab.textIndexLuceneMultilingual, new TextIndexLuceneMultilingualAssembler()) ; Assembler.general.implementWith(TextVocab.standardAnalyzer, new StandardAnalyzerAssembler()) ; Assembler.general.implementWith(TextVocab.simpleAnalyzer, new SimpleAnalyzerAssembler()) ; Assembler.general.implementWith(TextVocab.keywordAnalyzer, new KeywordAnalyzerAssembler()) ; Assembler.general.implementWith(TextVocab.lowerCaseKeywordAnalyzer, new LowerCaseKeywordAnalyzerAssembler()) ; + Assembler.general.implementWith(TextVocab.localizedAnalyzer, new LocalizedAnalyzerAssembler()) ; } } diff --git a/jena-text/src/main/java/org/apache/jena/query/text/assembler/TextIndexLuceneMultilingualAssembler.java b/jena-text/src/main/java/org/apache/jena/query/text/assembler/TextIndexLuceneMultilingualAssembler.java new file mode 100644 index 00000000000..a36fcbe5725 --- /dev/null +++ b/jena-text/src/main/java/org/apache/jena/query/text/assembler/TextIndexLuceneMultilingualAssembler.java @@ -0,0 
+1,87 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.jena.query.text.assembler; + +import org.apache.jena.assembler.Assembler; +import org.apache.jena.assembler.Mode; +import org.apache.jena.assembler.assemblers.AssemblerBase; +import org.apache.jena.atlas.io.IO; +import org.apache.jena.atlas.lib.IRILib; +import org.apache.jena.query.text.EntityDefinition; +import org.apache.jena.query.text.TextDatasetFactory; +import org.apache.jena.query.text.TextIndex; +import org.apache.jena.query.text.TextIndexException; +import org.apache.jena.rdf.model.RDFNode; +import org.apache.jena.rdf.model.Resource; +import org.apache.jena.sparql.util.graph.GraphUtils; +import org.apache.lucene.store.Directory; +import org.apache.lucene.store.FSDirectory; +import org.apache.lucene.store.RAMDirectory; + +import java.io.File; +import java.io.IOException; + +import static org.apache.jena.query.text.assembler.TextVocab.pDirectory; +import static org.apache.jena.query.text.assembler.TextVocab.pEntityMap; + +public class TextIndexLuceneMultilingualAssembler extends AssemblerBase { + /* + <#index> a :TextIndexLuceneMultilingual ; + #text:directory "mem" ; + #text:directory "DIR" ; + text:directory ; + 
text:entityMap <#endMap> ; + . + */ + + @SuppressWarnings("resource") + @Override + public TextIndex open(Assembler a, Resource root, Mode mode) { + try { + if ( !GraphUtils.exactlyOneProperty(root, pDirectory) ) + throw new TextIndexException("No 'text:directory' property on " + root) ; + + Directory directory ; + + RDFNode n = root.getProperty(pDirectory).getObject() ; + if ( n.isLiteral() ) { + String literalValue = n.asLiteral().getLexicalForm() ; + if (literalValue.equals("mem")) { + directory = new RAMDirectory() ; + } else { + File dir = new File(literalValue) ; + directory = FSDirectory.open(dir) ; + } + } else { + Resource x = n.asResource() ; + String path = IRILib.IRIToFilename(x.getURI()) ; + File dir = new File(path) ; + directory = FSDirectory.open(dir) ; + } + + Resource r = GraphUtils.getResourceValue(root, pEntityMap) ; + EntityDefinition docDef = (EntityDefinition)a.open(r) ; + + return TextDatasetFactory.createLuceneIndexMultilingual(directory, docDef) ; + } catch (IOException e) { + IO.exception(e) ; + return null ; + } + } +} diff --git a/jena-text/src/main/java/org/apache/jena/query/text/assembler/TextVocab.java b/jena-text/src/main/java/org/apache/jena/query/text/assembler/TextVocab.java index a835a6f39bc..79c223e5f17 100644 --- a/jena-text/src/main/java/org/apache/jena/query/text/assembler/TextVocab.java +++ b/jena-text/src/main/java/org/apache/jena/query/text/assembler/TextVocab.java @@ -36,6 +36,8 @@ public class TextVocab public static final Resource textIndex = Vocab.resource(NS, "TextIndex") ; public static final Resource textIndexSolr = Vocab.resource(NS, "TextIndexSolr") ; public static final Resource textIndexLucene = Vocab.resource(NS, "TextIndexLucene") ; + public static final Resource textIndexLuceneMultilingual = Vocab.resource(NS, "TextIndexLuceneMultilingual") ; + public static final Property pLanguage = Vocab.property(NS, "language") ; public static final Property pServer = Vocab.property(NS, "server") ; // Solr public static 
final Property pDirectory = Vocab.property(NS, "directory") ; // Lucene public static final Property pQueryAnalyzer = Vocab.property(NS, "queryAnalyzer") ; @@ -57,6 +59,7 @@ public class TextVocab public static final Resource simpleAnalyzer = Vocab.resource(NS, "SimpleAnalyzer"); public static final Resource keywordAnalyzer = Vocab.resource(NS, "KeywordAnalyzer"); public static final Resource lowerCaseKeywordAnalyzer = Vocab.resource(NS, "LowerCaseKeywordAnalyzer"); + public static final Resource localizedAnalyzer = Vocab.resource(NS, "LocalizedAnalyzer"); } diff --git a/jena-text/src/test/java/org/apache/jena/query/text/TS_Text.java b/jena-text/src/test/java/org/apache/jena/query/text/TS_Text.java index 115b493bd31..02196752a4a 100644 --- a/jena-text/src/test/java/org/apache/jena/query/text/TS_Text.java +++ b/jena-text/src/test/java/org/apache/jena/query/text/TS_Text.java @@ -30,6 +30,7 @@ @SuiteClasses({ TestBuildTextDataset.class , TestDatasetWithLuceneTextIndex.class + , TestDatasetWithLuceneMultilingualTextIndex.class , TestDatasetWithLuceneGraphTextIndex.class // Embedded solr not supported @@ -45,6 +46,7 @@ , TestDatasetWithKeywordAnalyzer.class , TestDatasetWithLowerCaseKeywordAnalyzer.class , TestLuceneWithMultipleThreads.class + , TestDatasetWithLocalizedAnalyzer.class }) public class TS_Text diff --git a/jena-text/src/test/java/org/apache/jena/query/text/TestDatasetWithLocalizedAnalyzer.java b/jena-text/src/test/java/org/apache/jena/query/text/TestDatasetWithLocalizedAnalyzer.java new file mode 100644 index 00000000000..b663197fc47 --- /dev/null +++ b/jena-text/src/test/java/org/apache/jena/query/text/TestDatasetWithLocalizedAnalyzer.java @@ -0,0 +1,147 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. 
The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.jena.query.text; + +import org.apache.jena.assembler.Assembler; +import org.apache.jena.atlas.lib.StrUtils; +import org.apache.jena.query.Dataset; +import org.apache.jena.query.text.assembler.TextAssembler; +import org.apache.jena.rdf.model.Model; +import org.apache.jena.rdf.model.ModelFactory; +import org.apache.jena.rdf.model.Resource; +import org.junit.After; +import org.junit.Before; +import org.junit.Test; + +import java.io.File; +import java.io.Reader; +import java.io.StringReader; +import java.util.Arrays; +import java.util.HashSet; +import java.util.Set; + +/** + * This class defines a setup configuration for a dataset that uses a simple analyzer with a Lucene index. 
+ */ +public class TestDatasetWithLocalizedAnalyzer extends AbstractTestDatasetWithTextIndexBase { + private static final String INDEX_PATH = "target/test/TestDatasetWithLuceneIndex"; + private static final File indexDir = new File(INDEX_PATH); + + private static final String SPEC_BASE = "http://example.org/spec#"; + private static final String SPEC_ROOT_LOCAL = "lucene_text_dataset"; + private static final String SPEC_ROOT_URI = SPEC_BASE + SPEC_ROOT_LOCAL; + private static final String SPEC; + static { + SPEC = StrUtils.strjoinNL( + "prefix rdfs: ", + "prefix ja: ", + "prefix tdb: ", + "prefix text: ", + "prefix : <" + SPEC_BASE + ">", + "", + "[] ja:loadClass \"org.apache.jena.query.text.TextQuery\" .", + "text:TextDataset rdfs:subClassOf ja:RDFDataset .", + "text:TextIndexLucene rdfs:subClassOf text:TextIndex .", + + ":" + SPEC_ROOT_LOCAL, + " a text:TextDataset ;", + " text:dataset :dataset ;", + " text:index :indexLucene ;", + " .", + "", + ":dataset", + " a ja:RDFDataset ;", + " ja:defaultGraph :graph ;", + ".", + ":graph", + " a ja:MemoryModel ;", + ".", + "", + ":indexLucene", + " a text:TextIndexLucene ;", + " text:directory ;", + " text:entityMap :entMap ;", + " .", + "", + ":entMap", + " a text:EntityMap ;", + " text:entityField \"uri\" ;", + " text:defaultField \"label\" ;", + " text:map (", + " [ text:field \"label\" ; ", + " text:predicate rdfs:label ;", + " text:analyzer [ " + + " a text:LocalizedAnalyzer ;" + + " text:language \"en\" " + + " ]", + " ]", + " [ text:field \"label\" ; text:predicate rdfs:label ]", + " [ text:field \"comment\" ; text:predicate rdfs:comment ]", + " ) ." 
+ ); + } + + public void init() { + Reader reader = new StringReader(SPEC); + Model specModel = ModelFactory.createDefaultModel(); + specModel.read(reader, "", "TURTLE"); + TextAssembler.init(); +// deleteOldFiles(); + indexDir.mkdirs(); + Resource root = specModel.getResource(SPEC_ROOT_URI); + dataset = (Dataset) Assembler.general.open(root); + } + + + public void deleteOldFiles() { + dataset.close(); + if (indexDir.exists()) TextSearchUtil.emptyAndDeleteDirectory(indexDir); + } + + @Before + public void beforeClass() { + init(); + } + + @After + public void afterClass() { + deleteOldFiles(); + } + + @Test + public void testLocalizedAnalyzer() { + final String turtle = StrUtils.strjoinNL( + TURTLE_PROLOG, + "<" + RESOURCE_BASE + "testLocalizedAnalyzer>", + " rdfs:label 'This is my book'", + "." + ); + // the localized analyzer should use localized lucene index facilities (stop words, stemming...) + String queryString = StrUtils.strjoinNL( + QUERY_PROLOG, + "SELECT ?s", + "WHERE {", + " ?s text:query ( rdfs:label 'books' 10 ) .", + "}" + ); + Set expectedURIs = new HashSet<>() ; + expectedURIs.addAll( Arrays.asList("http://example.org/data/resource/testLocalizedAnalyzer")) ; + doTestSearch(turtle, queryString, expectedURIs); + } +} diff --git a/jena-text/src/test/java/org/apache/jena/query/text/TestDatasetWithLuceneMultilingualTextIndex.java b/jena-text/src/test/java/org/apache/jena/query/text/TestDatasetWithLuceneMultilingualTextIndex.java new file mode 100644 index 00000000000..c163ec17bc5 --- /dev/null +++ b/jena-text/src/test/java/org/apache/jena/query/text/TestDatasetWithLuceneMultilingualTextIndex.java @@ -0,0 +1,112 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. 
The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.jena.query.text; + +import org.apache.jena.assembler.Assembler; +import org.apache.jena.atlas.lib.StrUtils; +import org.apache.jena.query.Dataset; +import org.apache.jena.query.text.assembler.TextAssembler; +import org.apache.jena.rdf.model.Model; +import org.apache.jena.rdf.model.ModelFactory; +import org.apache.jena.rdf.model.Resource; +import org.junit.After; +import org.junit.Before; +import org.junit.Test; + +import java.io.Reader; +import java.io.StringReader; +import java.util.HashSet; + +public class TestDatasetWithLuceneMultilingualTextIndex extends AbstractTestDatasetWithTextIndex { + + private static final String SPEC_BASE = "http://example.org/spec#"; + private static final String SPEC_ROOT_LOCAL = "lucene_text_dataset"; + private static final String SPEC_ROOT_URI = SPEC_BASE + SPEC_ROOT_LOCAL; + private static final String SPEC; + static { + SPEC = StrUtils.strjoinNL( + "prefix rdfs: <http://www.w3.org/2000/01/rdf-schema#> ", + "prefix ja: <http://jena.hpl.hp.com/2005/11/Assembler#> ", + "prefix tdb: <http://jena.hpl.hp.com/2008/tdb#>", + "prefix text: <http://jena.apache.org/text#>", + "prefix : <" + SPEC_BASE + ">", + "", + "[] ja:loadClass \"org.apache.jena.query.text.TextQuery\" .", + "text:TextDataset rdfs:subClassOf ja:RDFDataset .", + "text:TextIndexLuceneMultilingual rdfs:subClassOf text:TextIndex .", + + ":" + SPEC_ROOT_LOCAL, + " a text:TextDataset ;", + " text:dataset :dataset ;", + " text:index :indexLucene ;", + " .", + "", + ":dataset", + " a ja:RDFDataset ;", + " ja:defaultGraph
:graph ;", + ".", + ":graph", + " a ja:MemoryModel ;", + ".", + "", + ":indexLucene", + " a text:TextIndexLuceneMultilingual ;", + " text:directory \"mem\" ;", + " text:entityMap :entMap ;", + " .", + "", + ":entMap", + " a text:EntityMap ;", + " text:entityField \"uri\" ;", + " text:defaultField \"label\" ;", + " text:map (", + " [ text:field \"label\" ; text:predicate rdfs:label ]", + " [ text:field \"comment\" ; text:predicate rdfs:comment ]", + " ) ." + ); + } + + @Before + public void before() { + Reader reader = new StringReader(SPEC); + Model specModel = ModelFactory.createDefaultModel(); + specModel.read(reader, "", "TURTLE"); + TextAssembler.init(); + Resource root = specModel.getResource(SPEC_ROOT_URI); + dataset = (Dataset) Assembler.general.open(root); + } + + @After + public void after() { + dataset.close(); + } + + @Test + public void testNoResultsOnFirstCreateIndex(){ + String turtle = ""; + String queryString = StrUtils.strjoinNL( + QUERY_PROLOG, + "SELECT ?s", + "WHERE {", + " ?s text:query ( rdfs:label \"book\" \"lang:en\" 10 ) .", + "}" + ); + doTestSearch(turtle, queryString, new HashSet()); + } +} From a4a84bd61aac493872418080abd08b57c4077fc0 Mon Sep 17 00:00:00 2001 From: Alexis Miara Date: Mon, 11 May 2015 10:52:08 -0400 Subject: [PATCH 9/9] new tests + util adjustments --- .../apache/jena/query/text/LuceneUtil.java | 59 ++++++++++++++- .../text/TextIndexLuceneMultilingual.java | 2 +- .../TestDatasetWithLocalizedAnalyzer.java | 6 +- ...atasetWithLuceneMultilingualTextIndex.java | 73 +++++++++++++++++++ 4 files changed, 132 insertions(+), 8 deletions(-) diff --git a/jena-text/src/main/java/org/apache/jena/query/text/LuceneUtil.java b/jena-text/src/main/java/org/apache/jena/query/text/LuceneUtil.java index 7c41ea9b97e..d83de784d28 100644 --- a/jena-text/src/main/java/org/apache/jena/query/text/LuceneUtil.java +++ b/jena-text/src/main/java/org/apache/jena/query/text/LuceneUtil.java @@ -31,6 +31,10 @@ public class LuceneUtil { initAnalyzerDefs();
} + public static Analyzer createAnalyzer(String lang) { + return createAnalyzer(lang, TextIndexLucene.VER); + } + public static Analyzer createAnalyzer(String lang, Version ver) { lang = getISO2Language(lang); if (lang == null) @@ -49,10 +53,57 @@ public static Analyzer createAnalyzer(String lang, Version ver) { } public static String getISO2Language(String lang) { - if (lang == null) - return null; - else - return lang.split("-")[0].toLowerCase(); + if (lang != null) { + lang = lang.split("-")[0].toLowerCase(); + if (lang.length() == 2) + return lang; + else { + if ("ara".equals(lang)) return "ar"; + if ("bul".equals(lang)) return "bg"; + if ("cat".equals(lang)) return "ca"; + if ("ces".equals(lang)) return "cs"; + if ("cze".equals(lang)) return "cs"; + if ("dan".equals(lang)) return "da"; + if ("deu".equals(lang)) return "de"; + if ("ger".equals(lang)) return "de"; + if ("ell".equals(lang)) return "el"; + if ("gre".equals(lang)) return "el"; + if ("eng".equals(lang)) return "en"; + if ("spa".equals(lang)) return "es"; + if ("eus".equals(lang)) return "eu"; + if ("baq".equals(lang)) return "eu"; + if ("fas".equals(lang)) return "fa"; + if ("per".equals(lang)) return "fa"; + if ("fin".equals(lang)) return "fi"; + if ("fra".equals(lang)) return "fr"; + if ("fre".equals(lang)) return "fr"; + if ("gle".equals(lang)) return "ga"; + if ("glg".equals(lang)) return "gl"; + if ("hin".equals(lang)) return "hi"; + if ("hun".equals(lang)) return "hu"; + if ("hye".equals(lang)) return "hy"; + if ("arm".equals(lang)) return "hy"; + if ("ind".equals(lang)) return "id"; + if ("ita".equals(lang)) return "it"; + if ("jpn".equals(lang)) return "ja"; // ISO 639-1: Japanese is "ja" + if ("kor".equals(lang)) return "ko"; + if ("lav".equals(lang)) return "lv"; + if ("nld".equals(lang)) return "nl"; + if ("dut".equals(lang)) return "nl"; + if ("nor".equals(lang)) return "no"; + if ("por".equals(lang)) return "pt"; + if ("ron".equals(lang)) return "ro"; + if ("rum".equals(lang)) return "ro"; + if ("rus".equals(lang))
return "ru"; + if ("swe".equals(lang)) return "sv"; + if ("tha".equals(lang)) return "th"; + if ("tur".equals(lang)) return "tr"; + if ("zho".equals(lang)) return "zh"; + if ("chi".equals(lang)) return "zh"; + } + } + + return null; } private static void initAnalyzerDefs() { diff --git a/jena-text/src/main/java/org/apache/jena/query/text/TextIndexLuceneMultilingual.java b/jena-text/src/main/java/org/apache/jena/query/text/TextIndexLuceneMultilingual.java index 0af28ff9639..3f249968008 100644 --- a/jena-text/src/main/java/org/apache/jena/query/text/TextIndexLuceneMultilingual.java +++ b/jena-text/src/main/java/org/apache/jena/query/text/TextIndexLuceneMultilingual.java @@ -56,7 +56,7 @@ TextIndex getIndex(String lang) { if (!indexes.containsKey(lang)) { //dynamic creation of localized index try { - Analyzer analyzer = LuceneUtil.createAnalyzer(lang, TextIndexLucene.VER); + Analyzer analyzer = LuceneUtil.createAnalyzer(lang); if (analyzer != null) { Directory langDir; if (directory instanceof FSDirectory) { diff --git a/jena-text/src/test/java/org/apache/jena/query/text/TestDatasetWithLocalizedAnalyzer.java b/jena-text/src/test/java/org/apache/jena/query/text/TestDatasetWithLocalizedAnalyzer.java index b663197fc47..5becf5b5eae 100644 --- a/jena-text/src/test/java/org/apache/jena/query/text/TestDatasetWithLocalizedAnalyzer.java +++ b/jena-text/src/test/java/org/apache/jena/query/text/TestDatasetWithLocalizedAnalyzer.java @@ -86,9 +86,9 @@ public class TestDatasetWithLocalizedAnalyzer extends AbstractTestDatasetWithTex " text:map (", " [ text:field \"label\" ; ", " text:predicate rdfs:label ;", - " text:analyzer [ " + - " a text:LocalizedAnalyzer ;" + - " text:language \"en\" " + + " text:analyzer [ ", + " a text:LocalizedAnalyzer ;", + " text:language \"en\" ", " ]", " ]", " [ text:field \"label\" ; text:predicate rdfs:label ]", diff --git a/jena-text/src/test/java/org/apache/jena/query/text/TestDatasetWithLuceneMultilingualTextIndex.java
b/jena-text/src/test/java/org/apache/jena/query/text/TestDatasetWithLuceneMultilingualTextIndex.java index c163ec17bc5..d0eb4782241 100644 --- a/jena-text/src/test/java/org/apache/jena/query/text/TestDatasetWithLuceneMultilingualTextIndex.java +++ b/jena-text/src/test/java/org/apache/jena/query/text/TestDatasetWithLuceneMultilingualTextIndex.java @@ -31,7 +31,9 @@ import java.io.Reader; import java.io.StringReader; +import java.util.Arrays; import java.util.HashSet; +import java.util.Set; public class TestDatasetWithLuceneMultilingualTextIndex extends AbstractTestDatasetWithTextIndex { @@ -109,4 +111,75 @@ public void testNoResultsOnFirstCreateIndex(){ ); doTestSearch(turtle, queryString, new HashSet()); } + + @Test + public void testRetrievingEnglishLocalizedResource(){ + final String turtle = StrUtils.strjoinNL( + TURTLE_PROLOG, + "<" + RESOURCE_BASE + "testEnglishLocalizedResource>", + " rdfs:label 'He offered me a gift'@en", + ".", + TURTLE_PROLOG, + "<" + RESOURCE_BASE + "testGermanLocalizedResource>", + " rdfs:label 'Er schluckte gift'@de", + "." + ); + // the localized analyzer should use localized lucene index facilities (stop words, stemming...) + String queryString = StrUtils.strjoinNL( + QUERY_PROLOG, + "SELECT ?s", + "WHERE {", + " ?s text:query ( rdfs:label 'gift' 'lang:en' 10 ) .", + "}" + ); + Set<String> expectedURIs = new HashSet<>() ; + expectedURIs.addAll( Arrays.asList("http://example.org/data/resource/testEnglishLocalizedResource")) ; + doTestSearch(turtle, queryString, expectedURIs); + } + + @Test + public void testRetrievingGermanLocalizedResource(){ + final String turtle = StrUtils.strjoinNL( + TURTLE_PROLOG, + "<" + RESOURCE_BASE + "testEnglishLocalizedResource>", + " rdfs:label 'He offered me a gift'@en", + ".", + TURTLE_PROLOG, + "<" + RESOURCE_BASE + "testGermanLocalizedResource>", + " rdfs:label 'Er schluckte gift'@de", + "." + ); + // the localized analyzer should use localized lucene index facilities (stop words, stemming...)
+ String queryString = StrUtils.strjoinNL( + QUERY_PROLOG, + "SELECT ?s", + "WHERE {", + " ?s text:query ( rdfs:label 'gift' 'lang:de' 10 ) .", + "}" + ); + Set<String> expectedURIs = new HashSet<>() ; + expectedURIs.addAll( Arrays.asList("http://example.org/data/resource/testGermanLocalizedResource")) ; + doTestSearch(turtle, queryString, expectedURIs); + } + + @Test + public void testEnglishStemming(){ + final String turtle = StrUtils.strjoinNL( + TURTLE_PROLOG, + "<" + RESOURCE_BASE + "testEnglishStemming>", + " rdfs:label 'I met some engineers'@en", + "." + ); + // the localized analyzer should use localized lucene index facilities (stop words, stemming...) + String queryString = StrUtils.strjoinNL( + QUERY_PROLOG, + "SELECT ?s", + "WHERE {", + " ?s text:query ( rdfs:label 'engineering' 'lang:en' 10 ) .", + "}" + ); + Set<String> expectedURIs = new HashSet<>() ; + expectedURIs.addAll( Arrays.asList("http://example.org/data/resource/testEnglishStemming")) ; + doTestSearch(turtle, queryString, expectedURIs); + } }