From f176e150bcd1161afda5f6ba52aef678127b7464 Mon Sep 17 00:00:00 2001 From: Osma Suominen Date: Thu, 29 Oct 2015 17:08:54 +0200 Subject: [PATCH 01/11] first implementation of ASCIIFoldingLowerCaseKeywordAnalyzer --- .../ASCIIFoldingLowerCaseKeywordAnalyzer.java | 51 ++++++++++++++++ ...dingLowerCaseKeywordAnalyzerAssembler.java | 47 +++++++++++++++ .../query/text/assembler/TextAssembler.java | 1 + .../jena/query/text/assembler/TextVocab.java | 1 + .../org/apache/jena/query/text/TS_Text.java | 1 + ...hASCIIFoldingLowerCaseKeywordAnalyzer.java | 59 +++++++++++++++++++ .../assembler/TestEntityMapAssembler.java | 24 ++++++++ 7 files changed, 184 insertions(+) create mode 100644 jena-text/src/main/java/org/apache/jena/query/text/analyzer/ASCIIFoldingLowerCaseKeywordAnalyzer.java create mode 100644 jena-text/src/main/java/org/apache/jena/query/text/assembler/ASCIIFoldingLowerCaseKeywordAnalyzerAssembler.java create mode 100644 jena-text/src/test/java/org/apache/jena/query/text/TestDatasetWithASCIIFoldingLowerCaseKeywordAnalyzer.java diff --git a/jena-text/src/main/java/org/apache/jena/query/text/analyzer/ASCIIFoldingLowerCaseKeywordAnalyzer.java b/jena-text/src/main/java/org/apache/jena/query/text/analyzer/ASCIIFoldingLowerCaseKeywordAnalyzer.java new file mode 100644 index 00000000000..f76b3b21ca4 --- /dev/null +++ b/jena-text/src/main/java/org/apache/jena/query/text/analyzer/ASCIIFoldingLowerCaseKeywordAnalyzer.java @@ -0,0 +1,51 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.jena.query.text.analyzer ; + +import java.io.Reader ; + +import org.apache.lucene.analysis.Analyzer ; +import org.apache.lucene.analysis.core.KeywordTokenizer ; +import org.apache.lucene.analysis.core.LowerCaseFilter ; +import org.apache.lucene.analysis.miscellaneous.ASCIIFoldingFilter ; +import org.apache.lucene.util.Version ; + + +/** + * Lucene Analyzer implementation that works like KeywordAnalyzer (i.e. + * doesn't tokenize the input, keeps it as a single token), but lowercases the + * text and applies ASCII folding, making it case- and accent-insensitive. + */ + +public class ASCIIFoldingLowerCaseKeywordAnalyzer extends Analyzer { + private Version version; + + public ASCIIFoldingLowerCaseKeywordAnalyzer(Version ver) { + this.version = ver; + } + + @Override + protected TokenStreamComponents createComponents(String fieldName, Reader reader) { + KeywordTokenizer source = new KeywordTokenizer(reader); + LowerCaseFilter filter1 = new LowerCaseFilter(version, source); + ASCIIFoldingFilter filter2 = new ASCIIFoldingFilter(filter1); + return new TokenStreamComponents(source, filter2); + } + +} diff --git a/jena-text/src/main/java/org/apache/jena/query/text/assembler/ASCIIFoldingLowerCaseKeywordAnalyzerAssembler.java b/jena-text/src/main/java/org/apache/jena/query/text/assembler/ASCIIFoldingLowerCaseKeywordAnalyzerAssembler.java new file mode 100644 index 00000000000..cc4ed9d88a2 --- /dev/null +++ b/jena-text/src/main/java/org/apache/jena/query/text/assembler/ASCIIFoldingLowerCaseKeywordAnalyzerAssembler.java @@ -0,0 +1,47 @@ +/** + * Licensed to the Apache 
Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.jena.query.text.assembler ; + +import org.apache.jena.assembler.Assembler ; +import org.apache.jena.assembler.Mode ; +import org.apache.jena.assembler.assemblers.AssemblerBase ; +import org.apache.jena.query.text.TextIndexLucene; +import org.apache.jena.query.text.analyzer.ASCIIFoldingLowerCaseKeywordAnalyzer; +import org.apache.jena.rdf.model.Resource ; +import org.apache.lucene.analysis.Analyzer; + +/** + * Assembler to create ASCII-folding lowercase keyword analyzers. + */ +public class ASCIIFoldingLowerCaseKeywordAnalyzerAssembler extends AssemblerBase { + /* + text:map ( + [ text:field "text" ; + text:predicate rdfs:label; + text:analyzer [ + a text:ASCIIFoldingLowerCaseKeywordAnalyzer ; ] + ] + . 
+ */ + + @Override + public Analyzer open(Assembler a, Resource root, Mode mode) { + return new ASCIIFoldingLowerCaseKeywordAnalyzer(TextIndexLucene.VER); + } +} diff --git a/jena-text/src/main/java/org/apache/jena/query/text/assembler/TextAssembler.java b/jena-text/src/main/java/org/apache/jena/query/text/assembler/TextAssembler.java index 021c0030713..4bcdc57019b 100644 --- a/jena-text/src/main/java/org/apache/jena/query/text/assembler/TextAssembler.java +++ b/jena-text/src/main/java/org/apache/jena/query/text/assembler/TextAssembler.java @@ -34,6 +34,7 @@ public static void init() Assembler.general.implementWith(TextVocab.simpleAnalyzer, new SimpleAnalyzerAssembler()) ; Assembler.general.implementWith(TextVocab.keywordAnalyzer, new KeywordAnalyzerAssembler()) ; Assembler.general.implementWith(TextVocab.lowerCaseKeywordAnalyzer, new LowerCaseKeywordAnalyzerAssembler()) ; + Assembler.general.implementWith(TextVocab.asciiFoldingLowerCaseKeywordAnalyzer, new ASCIIFoldingLowerCaseKeywordAnalyzerAssembler()) ; Assembler.general.implementWith(TextVocab.localizedAnalyzer, new LocalizedAnalyzerAssembler()) ; } } diff --git a/jena-text/src/main/java/org/apache/jena/query/text/assembler/TextVocab.java b/jena-text/src/main/java/org/apache/jena/query/text/assembler/TextVocab.java index 743d773fa6c..23bf0a47b9a 100644 --- a/jena-text/src/main/java/org/apache/jena/query/text/assembler/TextVocab.java +++ b/jena-text/src/main/java/org/apache/jena/query/text/assembler/TextVocab.java @@ -62,6 +62,7 @@ public class TextVocab public static final Resource simpleAnalyzer = Vocab.resource(NS, "SimpleAnalyzer"); public static final Resource keywordAnalyzer = Vocab.resource(NS, "KeywordAnalyzer"); public static final Resource lowerCaseKeywordAnalyzer = Vocab.resource(NS, "LowerCaseKeywordAnalyzer"); + public static final Resource asciiFoldingLowerCaseKeywordAnalyzer = Vocab.resource(NS, "ASCIIFoldingLowerCaseKeywordAnalyzer"); public static final Resource localizedAnalyzer = 
Vocab.resource(NS, "LocalizedAnalyzer"); } diff --git a/jena-text/src/test/java/org/apache/jena/query/text/TS_Text.java b/jena-text/src/test/java/org/apache/jena/query/text/TS_Text.java index 07f141a1a39..bdf983c4a18 100644 --- a/jena-text/src/test/java/org/apache/jena/query/text/TS_Text.java +++ b/jena-text/src/test/java/org/apache/jena/query/text/TS_Text.java @@ -47,6 +47,7 @@ , TestDatasetWithStandardAnalyzer.class , TestDatasetWithKeywordAnalyzer.class , TestDatasetWithLowerCaseKeywordAnalyzer.class + , TestDatasetWithASCIIFoldingLowerCaseKeywordAnalyzer.class , TestLuceneWithMultipleThreads.class , TestDatasetWithLocalizedAnalyzer.class }) diff --git a/jena-text/src/test/java/org/apache/jena/query/text/TestDatasetWithASCIIFoldingLowerCaseKeywordAnalyzer.java b/jena-text/src/test/java/org/apache/jena/query/text/TestDatasetWithASCIIFoldingLowerCaseKeywordAnalyzer.java new file mode 100644 index 00000000000..d76e80a6ff2 --- /dev/null +++ b/jena-text/src/test/java/org/apache/jena/query/text/TestDatasetWithASCIIFoldingLowerCaseKeywordAnalyzer.java @@ -0,0 +1,59 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.jena.query.text; + +import java.util.Arrays ; +import java.util.HashSet ; +import java.util.Set ; + +import org.apache.jena.atlas.lib.StrUtils ; +import org.junit.Before ; +import org.junit.Test ; + +/** + * This class defines a setup configuration for a dataset that uses an ASCII folding lowercase keyword analyzer with a Lucene index. + */ +public class TestDatasetWithASCIIFoldingLowerCaseKeywordAnalyzer extends TestDatasetWithLowerCaseKeywordAnalyzer { + @Override + @Before + public void before() { + init("text:ASCIIFoldingLowerCaseKeywordAnalyzer"); + } + + @Test + public void testASCIIFoldingLowerCaseKeywordAnalyzerIsCaseAndAccentInsensitive() { + final String testName = "testASCIIFoldingLowerCaseKeywordAnalyzerIsCaseAndAccentInsensitive"; + final String turtle = StrUtils.strjoinNL( + TURTLE_PROLOG, + "<" + RESOURCE_BASE + testName + ">", + " rdfs:label 'Feeling a déjà vu'", + "." + ); + String queryString = StrUtils.strjoinNL( + QUERY_PROLOG, + "SELECT ?s", + "WHERE {", + " ?s text:query ( rdfs:label '\"feeling ä déja\"*' 10 ) .", + "}" + ); + Set expectedURIs = new HashSet<>() ; + expectedURIs.addAll( Arrays.asList(RESOURCE_BASE + testName)) ; + doTestSearch(turtle, queryString, expectedURIs); + } +} diff --git a/jena-text/src/test/java/org/apache/jena/query/text/assembler/TestEntityMapAssembler.java b/jena-text/src/test/java/org/apache/jena/query/text/assembler/TestEntityMapAssembler.java index ab3ed299953..3e6d9b08f62 100644 --- a/jena-text/src/test/java/org/apache/jena/query/text/assembler/TestEntityMapAssembler.java +++ b/jena-text/src/test/java/org/apache/jena/query/text/assembler/TestEntityMapAssembler.java @@ -29,6 +29,7 @@ import org.apache.jena.graph.Node ; import org.apache.jena.query.text.EntityDefinition ; import org.apache.jena.query.text.TextIndexException ; +import org.apache.jena.query.text.analyzer.ASCIIFoldingLowerCaseKeywordAnalyzer ; import org.apache.jena.query.text.analyzer.LowerCaseKeywordAnalyzer ; import 
org.apache.jena.rdf.model.* ; import org.apache.jena.vocabulary.RDF ; @@ -56,6 +57,7 @@ public class TestEntityMapAssembler { private static final Resource spec4; private static final Resource spec5; private static final Resource spec6; + private static final Resource spec7; private static final Resource specNoEntityField; private static final Resource specNoDefaultField; private static final Resource specNoMapProperty; @@ -119,6 +121,12 @@ private Object getOne(EntityDefinition entityDef, String field) { assertEquals(LowerCaseKeywordAnalyzer.class, entityDef.getAnalyzer(SPEC1_DEFAULT_FIELD).getClass()); } + @Test public void EntityHasMapEntryWithASCIIFoldingLowerCaseKeywordAnalyzer() { + EntityDefinitionAssembler entDefAssem = new EntityDefinitionAssembler(); + EntityDefinition entityDef = entDefAssem.open(Assembler.general, spec7, null); + assertEquals(ASCIIFoldingLowerCaseKeywordAnalyzer.class, entityDef.getAnalyzer(SPEC1_DEFAULT_FIELD).getClass()); + } + @Test(expected=TextIndexException.class) public void errorOnNoEntityField() { EntityDefinitionAssembler entDefAssem = new EntityDefinitionAssembler(); entDefAssem.open(null, specNoEntityField, null); @@ -254,6 +262,22 @@ private Object getOne(EntityDefinition entityDef, String field) { .addProperty(RDF.type, TextVocab.lowerCaseKeywordAnalyzer)) })); + // create a simple entity map specification using an ASCII folding lowercase keyword analyzer + + spec7 = model.createResource(TESTBASE + "spec7") + .addProperty(TextVocab.pEntityField, SPEC1_ENTITY_FIELD) + .addProperty(TextVocab.pDefaultField, SPEC1_DEFAULT_FIELD) + .addProperty(TextVocab.pMap, + model.createList( + new RDFNode[] { + model.createResource() + .addProperty(TextVocab.pField, SPEC1_DEFAULT_FIELD) + .addProperty(TextVocab.pPredicate, SPEC1_PREDICATE) + .addProperty(TextVocab.pAnalyzer, + model.createResource() + .addProperty(RDF.type, TextVocab.asciiFoldingLowerCaseKeywordAnalyzer)) + })); + // bad assembler spec specNoEntityField = From 
bc35cef743004c506e87264f1eab6c0434684438 Mon Sep 17 00:00:00 2001 From: Andy Seaborne Date: Sat, 31 Oct 2015 17:08:25 +0000 Subject: [PATCH 02/11] Correct one byte path. And reformat. --- .../apache/jena/atlas/io/InStreamUTF8.java | 143 ++++++++---------- 1 file changed, 65 insertions(+), 78 deletions(-) diff --git a/jena-base/src/main/java/org/apache/jena/atlas/io/InStreamUTF8.java b/jena-base/src/main/java/org/apache/jena/atlas/io/InStreamUTF8.java index acb9034ae39..5253b643e84 100644 --- a/jena-base/src/main/java/org/apache/jena/atlas/io/InStreamUTF8.java +++ b/jena-base/src/main/java/org/apache/jena/atlas/io/InStreamUTF8.java @@ -113,31 +113,27 @@ public void closeStream() { IO.close(input) ; } @Override - public int read(char[] cbuf, int off, int len) - { + public int read(char[] cbuf, int off, int len) { // Doing this on a block of bytes may be faster. - for ( int i = off ; i < off+len ; i++ ) - { - int x = read() ; - if ( x == -1 ) - { + for ( int i = off ; i < off + len ; i++ ) { + int x = read(); + if ( x == -1 ) { if ( i == off ) - return -1 ; - return (i-off) ; + return -1; + return (i - off); } - cbuf[i] = (char)x ; + cbuf[i] = (char)x; } - return len ; + return len; } @Override - public final int read() - { - int ch = advance(input) ; - //if ( ! Character.isDefined(ch) ) throw new AtlasException(String.format("Undefined codepoint: 0x%04X", ch)) ; - return ch ; - } - + public final int read() { + int ch = advance(input); + // if ( ! Character.isDefined(ch) ) throw new + // AtlasException(String.format("Undefined codepoint: 0x%04X", ch)) ; + return ch; + } /** Next codepoint, given the first byte of any UTF-8 byte sequence is already known. 
* Not necessarily a valid char (this function can be used a straight UTF8 decoder @@ -147,72 +143,66 @@ public final int advance() { return advance(input) ; } /** Next codepoint */ - public static final int advance(InputStreamBuffered input) - { + public static final int advance(InputStreamBuffered input) { int x = input.advance() ; if ( x == -1 ) return -1 ; return advance(input, x) ; } /** Next codepoint, given the first byte of any UTF-8 byte sequence is already known. - * Not necessarily a valid char (this function can be used a straight UTF8 decoder + * Not necessarily a valid char (this function can be used as a straight UTF8 decoder). */ - public static final int advance(InputStreamBuffered input, int x) - { + private static final int advance(InputStreamBuffered input, int x) { //count++ ; - // Fastpath - if ( x == -1 || x <= 127 ) - { - //count++ ; - return x ; + // ASCII Fastpath + if ( x == -1 || (x >= 0 && x <= 127) ) { + // count++ ; + return x; } // 10 => extension byte // 110..... => 2 bytes - if ( (x & 0xE0) == 0xC0 ) - { - int ch = readMultiBytes(input, x & 0x1F, 2) ; + if ( (x & 0xE0) == 0xC0 ) { + int ch = readMultiBytes(input, x & 0x1F, 2); // count += 2 ; - return ch ; - + return ch; + } - // 1110.... => 3 bytes : 16 bits : not outside 16bit chars - if ( (x & 0xF0) == 0xE0 ) - { - int ch = readMultiBytes(input, x & 0x0F, 3) ; + // 1110.... => 3 bytes : 16 bits : not outside 16bit chars + if ( (x & 0xF0) == 0xE0 ) { + int ch = readMultiBytes(input, x & 0x0F, 3); // count += 3 ; - //if ( ! Character.isDefined(ch) ) throw new AtlasException(String.format("Undefined codepoint: 0x%04X", ch)) ; - return ch ; + // if ( ! Character.isDefined(ch) ) throw new + // AtlasException(String.format("Undefined codepoint: 0x%04X", ch)) + // ; + return ch; } - // Looking like 4 byte charcater. - int ch = -2 ; + // Looking like 4 byte character. + int ch = -2; // 11110zzz => 4 bytes. 
- if ( (x & 0xF8) == 0xF0 ) - { - ch = readMultiBytes(input, x & 0x08, 4) ; - // Opsp - need two returns. Character.toChars(ch, chars, 0) ; - // count += 4 ; + if ( (x & 0xF8) == 0xF0 ) { + ch = readMultiBytes(input, x & 0x07, 4); // payload is the low 3 bits (zzz) of the lead byte + // Oops - need two returns. Character.toChars(ch, chars, 0) ; + // count += 4 ; } - - else - IO.exception(new IOException("Illegal UTF-8: "+x)) ; - // This test will go off. We're processing a 4 byte sequence but Java only supports 16 bit chars. + else + IO.exception(new IOException("Illegal UTF-8: " + x)); + + // This test will go off. We're processing a 4 byte sequence but Java + // only supports 16 bit chars. if ( ch > Character.MAX_VALUE ) - throw new AtlasException("Out of range character (must use a surrogate pair)") ; - if ( ! Character.isDefined(ch) ) throw new AtlasException(String.format("Undefined codepoint: 0x%04X", ch)) ; - return ch ; + throw new AtlasException("Out of range character (must use a surrogate pair)"); + if ( !Character.isDefined(ch) ) + throw new AtlasException(String.format("Undefined codepoint: 0x%04X", ch)); + return ch; } - private static int readMultiBytes(InputStreamBuffered input, int start, int len) //throws IOException - { - //System.out.print(" -("+len+")") ; p(start) ; - + private static int readMultiBytes(InputStreamBuffered input, int start, int len) { int x = start ; - for ( int i = 0 ; i < len-1 ; i++ ) - { + for ( int i = 0 ; i < len-1 ; i++ ) { int x2 = input.advance() ; if ( x2 == -1 ) throw new AtlasException("Premature end to UTF-8 sequence at end of input") ; @@ -226,28 +216,25 @@ private static int readMultiBytes(InputStreamBuffered input, int start, int len) return x ; } - private static void p(int ch) - { - System.out.printf(" %02X", ch) ; + private static void p(int ch) { + System.out.printf(" %02X", ch); if ( ch == -1 ) System.out.println(); } - - public static String decode(byte[] bytes) - { - try - { - char[] chars = new char[bytes.length] ; - InputStream in = new 
ByteArrayInputStream(bytes) ; - Reader r = new InStreamUTF8(in) ; - int len ; - len = r.read(chars) ; - IO.close(r) ; - return new String(chars, 0, len) ; - } catch (IOException ex) - { - IO.exception(ex) ; - return null ; + + public static String decode(byte[] bytes) { + try { + char[] chars = new char[bytes.length]; + InputStream in = new ByteArrayInputStream(bytes); + Reader r = new InStreamUTF8(in); + int len; + len = r.read(chars); + IO.close(r); + return new String(chars, 0, len); + } + catch (IOException ex) { + IO.exception(ex); + return null; } } } From 00ba425564665c6ba5742c81d1f97c790c9352fb Mon Sep 17 00:00:00 2001 From: Andy Seaborne Date: Sat, 31 Oct 2015 17:08:34 +0000 Subject: [PATCH 03/11] JENA-1059: Optimize insert and delete of constant triples/quads. "Constant" means uses URIs and literals only. Remove out-of-date comment about bulk update. --- .../sparql/modify/UpdateEngineWorker.java | 93 +++++++++++-------- .../sparql/modify/TestUpdateOperations.java | 37 ++++++++ 2 files changed, 93 insertions(+), 37 deletions(-) diff --git a/jena-arq/src/main/java/org/apache/jena/sparql/modify/UpdateEngineWorker.java b/jena-arq/src/main/java/org/apache/jena/sparql/modify/UpdateEngineWorker.java index 333b7f3900e..b70b9ea6d88 100644 --- a/jena-arq/src/main/java/org/apache/jena/sparql/modify/UpdateEngineWorker.java +++ b/jena-arq/src/main/java/org/apache/jena/sparql/modify/UpdateEngineWorker.java @@ -20,13 +20,16 @@ import static org.apache.jena.sparql.modify.TemplateLib.template ; +import java.util.ArrayList ; import java.util.Iterator ; import java.util.List ; + import org.apache.jena.atlas.data.BagFactory ; import org.apache.jena.atlas.data.DataBag ; import org.apache.jena.atlas.data.ThresholdPolicy ; import org.apache.jena.atlas.data.ThresholdPolicyFactory ; import org.apache.jena.atlas.iterator.Iter ; +import org.apache.jena.atlas.lib.Pair ; import org.apache.jena.atlas.lib.Sink ; import org.apache.jena.atlas.web.TypedInputStream ; import 
org.apache.jena.graph.Graph ; @@ -503,45 +506,61 @@ protected Element elementFromQuads(List quads) return el ; } - protected void execDelete(List quads, Node dftGraph, Iterator bindings) - { - Iterator it = template(quads, dftGraph, bindings) ; + // execDelete ; execInsert + // Quads involving only IRIs and literals do not change from binding to + // binding so any inserts, rather than repeatedly if they are going to be + // done at all. Note bNodes (if legal at this point) change from template + // instantiation to instantiation. + + private static Pair, List> split(List quads) { + List constQuads = new ArrayList<>(quads.size()) ; + List templateQuads = new ArrayList<>(quads.size()) ; + quads.forEach((q)-> { + if ( constQuad(q)) + constQuads.add(q) ; + else + templateQuads.add(q) ; + }) ; + return Pair.create(constQuads, templateQuads); + } + + private static boolean constQuad(Quad quad) { + return constTerm(quad.getGraph()) && + constTerm(quad.getSubject()) && + constTerm(quad.getPredicate()) && + constTerm(quad.getObject()) ; + } + + private static boolean constTerm(Node n) { + return n.isURI() || n.isLiteral() ; + } + + protected void execDelete(List quads, Node dftGraph, Iterator bindings) { + Pair, List> p = split(quads) ; + execDelete(p.getLeft(), p.getRight(), dftGraph, bindings) ; + } + + protected void execDelete(List onceQuads, List templateQuads, Node dftGraph, Iterator bindings) { + if ( onceQuads != null && bindings.hasNext() ) + // If at least once. + onceQuads.forEach(datasetGraph::delete); + Iterator it = template(templateQuads, dftGraph, bindings) ; if ( it == null ) return ; - - while (it.hasNext()) - { - Quad q = it.next(); - datasetGraph.delete(q); - } - - - // Alternate implementation that can use the graph BulkUpdateHandler, but forces all quads into - // memory (we don't want that!). The issue is that all of the quads can be mixed up based on the - // user supplied template. 
If graph stores can benefit from bulk insert/delete operations, then we - // need to expose a bulk update interface on datasetGraph, not just Graph. -// MultiMap acc = MultiMap.createMapList() ; -// while (it.hasNext()) -// { -// Quad q = it.next(); -// acc.put(q.getGraph(), q.asTriple()) ; -// } -// for ( Node gn : acc.keys() ) -// { -// Collection triples = acc.get(gn) ; -// graph(datasetGraph, gn).getBulkUpdateHandler().delete(triples.iterator()) ; -// } - } - - protected void execInsert(List quads, Node dftGraph, Iterator bindings) - { - Iterator it = template(quads, dftGraph, bindings) ; + it.forEachRemaining(datasetGraph::delete) ; + } + + protected void execInsert(List quads, Node dftGraph, Iterator bindings) { + Pair, List> p = split(quads) ; + execInsert(p.getLeft(), p.getRight(), dftGraph, bindings) ; + } + + protected void execInsert(List onceQuads, List templateQuads, Node dftGraph, Iterator bindings) { + if ( onceQuads != null && bindings.hasNext() ) + // If at least once. + onceQuads.forEach((q)->addTodatasetGraph(datasetGraph, q)) ; + Iterator it = template(templateQuads, dftGraph, bindings) ; if ( it == null ) return ; - - while (it.hasNext()) - { - Quad q = it.next(); - addTodatasetGraph(datasetGraph, q); - } + it.forEachRemaining((q)->addTodatasetGraph(datasetGraph, q)) ; } // Catch all individual adds of quads (and deletes - mainly for symmetry). 
diff --git a/jena-arq/src/test/java/org/apache/jena/sparql/modify/TestUpdateOperations.java b/jena-arq/src/test/java/org/apache/jena/sparql/modify/TestUpdateOperations.java index ead8940b697..3f95d975439 100644 --- a/jena-arq/src/test/java/org/apache/jena/sparql/modify/TestUpdateOperations.java +++ b/jena-arq/src/test/java/org/apache/jena/sparql/modify/TestUpdateOperations.java @@ -18,6 +18,8 @@ package org.apache.jena.sparql.modify; +import java.util.concurrent.atomic.AtomicLong ; + import org.apache.jena.atlas.iterator.Iter ; import org.apache.jena.atlas.junit.BaseTest ; import org.apache.jena.graph.Node ; @@ -28,6 +30,8 @@ import org.apache.jena.rdf.model.Resource ; import org.apache.jena.sparql.core.DatasetGraph ; import org.apache.jena.sparql.core.DatasetGraphFactory ; +import org.apache.jena.sparql.core.DatasetGraphWrapper ; +import org.apache.jena.sparql.core.Quad ; import org.apache.jena.sparql.sse.SSE ; import org.apache.jena.update.* ; import org.apache.jena.vocabulary.OWL ; @@ -95,5 +99,38 @@ public void load4() { assertEquals(1, m.listStatements(anon, null, (RDFNode)null).toList().size()); assertEquals(1, m.listStatements(null, null, anon).toList().size()); } + + // Check constant and template quads + @Test public void delete_insert_where_01() { + DatasetGraph dsg0 = DatasetGraphFactory.createMem() ; + UpdateRequest req = UpdateFactory.create("INSERT DATA {

2 . 2 . 3 . }") ; + UpdateAction.execute(req, dsg0); + assertEquals(3, dsg0.getDefaultGraph().size()) ; + + AtomicLong counterIns = new AtomicLong(0) ; + AtomicLong counterDel = new AtomicLong(0) ; + DatasetGraph dsg = new DatasetGraphWrapper(dsg0) { + @Override + public void add(Quad quad) { + counterIns.incrementAndGet() ; + get().add(quad) ; + } + + @Override + public void delete(Quad quad) { + counterDel.incrementAndGet() ; + get().delete(quad) ; + } + } ; + + // WHERE clause doubles the effect. + String s = "DELETE { ?x

2 . 2 } INSERT { ?x

1 . 1 } WHERE { ?x

?o {} UNION {} }" ; + req = UpdateFactory.create(s) ; + UpdateAction.execute(req, dsg); + assertEquals(3, counterIns.get()) ; // 3 : 1 constant, 2 from template. + assertEquals(3, counterDel.get()) ; // 3 : 1 constant, 2 from template. + assertEquals(3, dsg.getDefaultGraph().size()) ; + } + } From 0433e9e7e58d6f5bf6acb6561a8b12b4b1ac5d9e Mon Sep 17 00:00:00 2001 From: Andy Seaborne Date: Sun, 1 Nov 2015 17:11:09 +0000 Subject: [PATCH 04/11] Port was set twice. --- .../main/java/org/apache/jena/fuseki/jetty/JettyFuseki.java | 3 --- 1 file changed, 3 deletions(-) diff --git a/jena-fuseki2/jena-fuseki-core/src/main/java/org/apache/jena/fuseki/jetty/JettyFuseki.java b/jena-fuseki2/jena-fuseki-core/src/main/java/org/apache/jena/fuseki/jetty/JettyFuseki.java index 8b676935de1..85784b9b92f 100644 --- a/jena-fuseki2/jena-fuseki-core/src/main/java/org/apache/jena/fuseki/jetty/JettyFuseki.java +++ b/jena-fuseki2/jena-fuseki-core/src/main/java/org/apache/jena/fuseki/jetty/JettyFuseki.java @@ -281,12 +281,9 @@ private void defaultServerConfig(int port, boolean loopback) { ServerConnector connector = new ServerConnector(server, f1) ; //, f2) ; connector.setPort(port) ; - server.addConnector(connector); - if ( loopback ) connector.setHost("localhost"); - connector.setPort(port) ; serverConnector = connector ; } } From 364a9ddcd56757a181ef929d3d02937bda76baef Mon Sep 17 00:00:00 2001 From: Andy Seaborne Date: Sun, 1 Nov 2015 17:11:36 +0000 Subject: [PATCH 05/11] JENA-1059: Fine tune space allocation for lists. 
--- .../org/apache/jena/sparql/modify/UpdateEngineWorker.java | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/jena-arq/src/main/java/org/apache/jena/sparql/modify/UpdateEngineWorker.java b/jena-arq/src/main/java/org/apache/jena/sparql/modify/UpdateEngineWorker.java index b70b9ea6d88..7f8e853bcd2 100644 --- a/jena-arq/src/main/java/org/apache/jena/sparql/modify/UpdateEngineWorker.java +++ b/jena-arq/src/main/java/org/apache/jena/sparql/modify/UpdateEngineWorker.java @@ -506,6 +506,7 @@ protected Element elementFromQuads(List quads) return el ; } + // JENA-1059 // execDelete ; execInsert // Quads involving only IRIs and literals do not change from binding to // binding so any inserts, rather than repeatedly if they are going to be @@ -513,8 +514,11 @@ protected Element elementFromQuads(List quads) // instantiation to instantiation. private static Pair, List> split(List quads) { + // Guess size. + // Pre-size in case large (i.e. 10K+). List constQuads = new ArrayList<>(quads.size()) ; - List templateQuads = new ArrayList<>(quads.size()) ; + // ... in which case we assume the templated triples are small / non-existent. + List templateQuads = new ArrayList<>() ; quads.forEach((q)-> { if ( constQuad(q)) constQuads.add(q) ; From 71ff28be1137794e7c10cacf0b5850997a1c8438 Mon Sep 17 00:00:00 2001 From: Andy Seaborne Date: Sun, 1 Nov 2015 21:32:43 +0000 Subject: [PATCH 06/11] More timeout tuning. 
--- .../test/java/org/apache/jena/atlas/lib/TestAlarmClock.java | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/jena-base/src/test/java/org/apache/jena/atlas/lib/TestAlarmClock.java b/jena-base/src/test/java/org/apache/jena/atlas/lib/TestAlarmClock.java index 773e0fd818b..bce2a8b1a19 100644 --- a/jena-base/src/test/java/org/apache/jena/atlas/lib/TestAlarmClock.java +++ b/jena-base/src/test/java/org/apache/jena/atlas/lib/TestAlarmClock.java @@ -96,7 +96,7 @@ public void alarm_04() { AlarmClock alarmClock = new AlarmClock() ; alarmClock.add(callback, 10) ; alarmClock.add(callback, 20) ; - sleep(timeout(75, 300)) ; + sleep(timeout(150, 300)) ; // ping1 went off. ping2 went off. assertEquals(2, count.get()) ; alarmClock.release() ; @@ -105,9 +105,9 @@ public void alarm_04() { @Test public void alarm_05() { AlarmClock alarmClock = new AlarmClock() ; - alarmClock.add(callback, 1000) ; + alarmClock.add(callback, 10) ; alarmClock.reset(callback, 2000) ; - sleep(50) ; + sleep(timeout(100,200)) ; // The reset should have removed the callback before it happened. assertEquals(0, count.get()) ; alarmClock.release() ; From 133cf4b0407206777f85016da12f38e474bfed20 Mon Sep 17 00:00:00 2001 From: Andy Seaborne Date: Mon, 2 Nov 2015 11:22:52 +0000 Subject: [PATCH 07/11] JENA-1060: Introduce "Alarm" as an instance of future timeout. 
--- .../java/org/apache/jena/atlas/lib/Alarm.java | 35 +++++++++++++++++++ .../org/apache/jena/atlas/lib/AlarmClock.java | 24 +++++-------- .../apache/jena/atlas/lib/TestAlarmClock.java | 27 +++++++------- 3 files changed, 57 insertions(+), 29 deletions(-) create mode 100644 jena-base/src/main/java/org/apache/jena/atlas/lib/Alarm.java diff --git a/jena-base/src/main/java/org/apache/jena/atlas/lib/Alarm.java b/jena-base/src/main/java/org/apache/jena/atlas/lib/Alarm.java new file mode 100644 index 00000000000..fae5f6504e7 --- /dev/null +++ b/jena-base/src/main/java/org/apache/jena/atlas/lib/Alarm.java @@ -0,0 +1,35 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.jena.atlas.lib; + +import java.util.concurrent.ScheduledFuture ; + +/** An Alarm set by AlarmClock */ +public class Alarm { + public final Runnable task ; + public final ScheduledFuture future ; + /*package*/ final AlarmClock alarmClock; + + Alarm(AlarmClock alarmClock, Runnable task, ScheduledFuture future) { + super(); + this.alarmClock = alarmClock ; + this.task = task; + this.future = future; + } +} \ No newline at end of file diff --git a/jena-base/src/main/java/org/apache/jena/atlas/lib/AlarmClock.java b/jena-base/src/main/java/org/apache/jena/atlas/lib/AlarmClock.java index 6210bf14335..8272fe024ff 100644 --- a/jena-base/src/main/java/org/apache/jena/atlas/lib/AlarmClock.java +++ b/jena-base/src/main/java/org/apache/jena/atlas/lib/AlarmClock.java @@ -18,8 +18,7 @@ package org.apache.jena.atlas.lib ; -import java.util.concurrent.ScheduledThreadPoolExecutor ; -import java.util.concurrent.TimeUnit ; +import java.util.concurrent.* ; /** * An AlarmClock is an object that will make a callback (with a value) at a @@ -39,29 +38,24 @@ static public AlarmClock get() { } /** Add a task to be called after a delay (in milliseconds) */ - public void add(Runnable task, long delay) { + public Alarm add(Runnable task, long delay) { if ( task == null ) throw new IllegalArgumentException("Task is null") ; - timer.schedule(task, delay, TimeUnit.MILLISECONDS) ; + ScheduledFuture future = timer.schedule(task, delay, TimeUnit.MILLISECONDS) ; + return new Alarm(this, task, future) ; } /** Reschedule a task to now run after a different delay from now (in milliseconds) */ - public void reset(Runnable task, long delay) { - if ( task == null ) - throw new IllegalArgumentException("Task is null") ; - cancel(task) ; - add(task, delay) ; + public Alarm reset(Alarm alarm, long delay) { + cancel(alarm) ; + return add(alarm.task, delay) ; } /** Cancel a task */ - public void cancel(Runnable task) { - if ( task == null ) - throw new IllegalArgumentException("Task is 
null") ; - timer.remove(task) ; + public void cancel(Alarm alarm) { + alarm.future.cancel(false) ; } - // public int getCount() { return timer.getQueue().size(); } - /** Clean up */ public void release() { timer.shutdownNow() ; diff --git a/jena-base/src/test/java/org/apache/jena/atlas/lib/TestAlarmClock.java b/jena-base/src/test/java/org/apache/jena/atlas/lib/TestAlarmClock.java index bce2a8b1a19..8cee5f21016 100644 --- a/jena-base/src/test/java/org/apache/jena/atlas/lib/TestAlarmClock.java +++ b/jena-base/src/test/java/org/apache/jena/atlas/lib/TestAlarmClock.java @@ -61,8 +61,8 @@ private int timeout(int time1, int time2) { public void alarm_01() { AlarmClock alarmClock = new AlarmClock() ; // Very long - never happens. - alarmClock.add(callback, 10000000) ; - alarmClock.cancel(callback) ; + Alarm a = alarmClock.add(callback, 10000000) ; + alarmClock.cancel(a) ; assertEquals(0, count.get()) ; alarmClock.release() ; } @@ -71,31 +71,31 @@ public void alarm_01() { public void alarm_02() { AlarmClock alarmClock = new AlarmClock() ; // Short - happens. - alarmClock.add(callback, 10) ; + Alarm a = alarmClock.add(callback, 10) ; sleep(timeout(100, 250)) ; assertEquals(1, count.get()) ; // try to cancel anyway. - alarmClock.cancel(callback) ; + alarmClock.cancel(a) ; alarmClock.release() ; } @Test public void alarm_03() { AlarmClock alarmClock = new AlarmClock() ; - alarmClock.add(callback, 10) ; - alarmClock.add(callback, 1000000) ; + Alarm a1 = alarmClock.add(callback, 10) ; + Alarm a2 = alarmClock.add(callback, 1000000) ; sleep(timeout(100, 300)) ; // ping1 went off. assertEquals(1, count.get()) ; - alarmClock.cancel(callback) ; + alarmClock.cancel(a2) ; alarmClock.release() ; } @Test public void alarm_04() { AlarmClock alarmClock = new AlarmClock() ; - alarmClock.add(callback, 10) ; - alarmClock.add(callback, 20) ; + Alarm a1 = alarmClock.add(callback, 10) ; + Alarm a2 = alarmClock.add(callback, 20) ; sleep(timeout(150, 300)) ; // ping1 went off. ping2 went off. 
assertEquals(2, count.get()) ; @@ -105,11 +105,10 @@ public void alarm_04() { @Test public void alarm_05() { AlarmClock alarmClock = new AlarmClock() ; - alarmClock.add(callback, 10) ; - alarmClock.reset(callback, 2000) ; - sleep(timeout(100,200)) ; - // The reset should have removed the callback before it happened. - assertEquals(0, count.get()) ; + Alarm a = alarmClock.add(callback, 50) ; + alarmClock.reset(a, 20000) ; + sleep(timeout(100, 250)) ; + alarmClock.cancel(a); alarmClock.release() ; } } From 0df098ba0c24581c49ab187950f3c5bfcdf1b6a0 Mon Sep 17 00:00:00 2001 From: Andy Seaborne Date: Mon, 2 Nov 2015 12:00:29 +0000 Subject: [PATCH 08/11] JENA-1060: Use "Alarm" instances. --- .../sparql/engine/QueryExecutionBase.java | 72 ++++++++++--------- 1 file changed, 38 insertions(+), 34 deletions(-) diff --git a/jena-arq/src/main/java/org/apache/jena/sparql/engine/QueryExecutionBase.java b/jena-arq/src/main/java/org/apache/jena/sparql/engine/QueryExecutionBase.java index c72cd18244d..c679bdfbc34 100644 --- a/jena-arq/src/main/java/org/apache/jena/sparql/engine/QueryExecutionBase.java +++ b/jena-arq/src/main/java/org/apache/jena/sparql/engine/QueryExecutionBase.java @@ -24,6 +24,7 @@ import java.util.Set; import java.util.concurrent.TimeUnit; +import org.apache.jena.atlas.lib.Alarm ; import org.apache.jena.atlas.lib.AlarmClock; import org.apache.jena.atlas.logging.Log; import org.apache.jena.graph.Node; @@ -57,20 +58,20 @@ public class QueryExecutionBase implements QueryExecution // Initial bindings. // Split : QueryExecutionGraph already has the dataset. 
- private Query query ; - private Dataset dataset ; - private QueryEngineFactory qeFactory ; - private QueryIterator queryIterator = null ; - private Plan plan = null ; - private Context context ; - private QuerySolution initialBinding = null ; - - // Set if QueryIterator.cancel has been called - private volatile boolean isCancelled = false ; - private boolean closed ; - private volatile TimeoutCallback expectedCallback = null ; - private TimeoutCallback timeout1Callback = null ; - private TimeoutCallback timeout2Callback = null ; + private Query query; + private Dataset dataset; + private QueryEngineFactory qeFactory; + private QueryIterator queryIterator = null; + private Plan plan = null; + private Context context; + private QuerySolution initialBinding = null; + + // Set if QueryIterator.cancel has been called + private volatile boolean isCancelled = false; + private boolean closed; + private volatile TimeoutCallback expectedCallback = null; + private Alarm timeout1Alarm = null; + private Alarm timeout2Alarm = null; private final Object lockTimeout = new Object() ; // synchronization. private static final long TIMEOUT_UNSET = -1 ; @@ -142,10 +143,10 @@ public void close() queryIterator.close() ; if ( plan != null ) plan.close() ; - if ( timeout1Callback != null ) - alarmClock.cancel(timeout1Callback) ; - if ( timeout2Callback != null ) - alarmClock.cancel(timeout2Callback) ; + if ( timeout1Alarm != null ) + alarmClock.cancel(timeout1Alarm) ; + if ( timeout2Alarm != null ) + alarmClock.cancel(timeout2Alarm) ; } @Override @@ -461,8 +462,9 @@ protected Binding moveToNextBinding() // So nearly not needed. synchronized(lockTimeout) { - expectedCallback = timeout2Callback ; - // Lock against calls of .abort() nor of timeout1Callback. + TimeoutCallback callback = new TimeoutCallback() ; + expectedCallback = callback ; + // Lock against calls of .abort() or of timeout1Callback. // Update/check the volatiles in a careful order. 
// This cause timeout1 not to call .abort and hence not set isCancelled @@ -473,14 +475,16 @@ protected Binding moveToNextBinding() // timeout1 went off after the binding was yielded but // before we got here. throw new QueryCancelledException() ; - if ( timeout1Callback != null ) - alarmClock.cancel(timeout1Callback) ; - timeout1Callback = null ; + if ( timeout1Alarm != null ) { + alarmClock.cancel(timeout1Alarm) ; + timeout1Alarm = null ; + } // Now arm the second timeout, if any. - if ( timeout2 > 0 ) + if ( timeout2 > 0 ) { // Not first timeout - finite second timeout. - alarmClock.add(timeout2Callback, timeout2) ; + timeout2Alarm = alarmClock.add(callback, timeout2) ; + } resetDone = true ; } } @@ -526,9 +530,9 @@ private void startQueryIterator() if ( ! isTimeoutSet(timeout1) && isTimeoutSet(timeout2) ) { // Single overall timeout. - timeout2Callback = new TimeoutCallback() ; - expectedCallback = timeout2Callback ; - alarmClock.add(timeout2Callback, timeout2) ; + TimeoutCallback callback = new TimeoutCallback() ; + expectedCallback = callback ; + timeout2Alarm = alarmClock.add(callback, timeout2) ; // Start the query. queryIterator = getPlan().iterator() ; // But don't add resetter. @@ -536,19 +540,19 @@ private void startQueryIterator() } // Case isTimeoutSet(timeout1) + // Whether timeout2 is set is determined by QueryIteratorTimer2 + // Subcase 2: ! isTimeoutSet(timeout2) // Add timeout to first row. - timeout1Callback = new TimeoutCallback() ; - alarmClock.add(timeout1Callback, timeout1) ; - expectedCallback = timeout1Callback ; + TimeoutCallback callback = new TimeoutCallback() ; + timeout1Alarm = alarmClock.add(callback, timeout1) ; + expectedCallback = callback ; // We don't know if getPlan().iterator() does a lot of work or not // (ideally it shouldn't start executing the query but in some sub-systems // it might be necessary) queryIterator = getPlan().iterator() ; - // Add the timeout resetter wrapper. 
- timeout2Callback = new TimeoutCallback() ; - // Wrap with a resetter. + // Add the timeout1 resetter wrapper. queryIterator = new QueryIteratorTimer2(queryIterator) ; // Minor optimization - the first call of hasNext() or next() will From b0d1a9dc998e229595a3560096f6a0e404346250 Mon Sep 17 00:00:00 2001 From: Rob Vesse Date: Tue, 3 Nov 2015 03:37:41 +0000 Subject: [PATCH 09/11] Further fix for JENA-1003 - Avoid checking temporary directory if unknown - Remove PipeViewer (pv) support as it has proven to be horribly broken under many environments --- apache-jena/bin/tdbloader2index | 48 +++++---------------------------- 1 file changed, 7 insertions(+), 41 deletions(-) diff --git a/apache-jena/bin/tdbloader2index b/apache-jena/bin/tdbloader2index index 89296aae3ab..eed2edc29ee 100755 --- a/apache-jena/bin/tdbloader2index +++ b/apache-jena/bin/tdbloader2index @@ -266,23 +266,6 @@ debug "Jena Classpath is $JENA_CP" # All files are written S P O / G S P O columns per row but in different sort orders. info "Index Building Phase" -# Check whether Pipe Viewer is available -# Needs to temporarily disable exit on error as which produces an error -# if the given command is not found -#set +e -#which pv >/dev/null 2>&1 -#HAS_PV=$? 
-#set -e -#if [ $HAS_PV = 0 ]; then -# debug "pv (Pipe Viewer) available on your system so sorts will show progres" -#else -# debug "No pv (Pipe Viewer) on your system so sorts will show no progress" -#fi - -# TODO Pipe Viewer causes hangs in some terminals so currently disabled by default -# May be manually enabled by setting HAS_PV to zero in your environment -HAS_PV=${HAS_PV:-1} - # Check where we are storing temporary sort files debug "Sort Arguments: $SORT_ARGS" SORT_TEMP_DIR= @@ -355,10 +338,12 @@ generate_index() debug "Insufficient free memory to sort data in-memory, sort will need to perform an external sort using Temp Directory ${SORT_TEMP_DIR}" # Check for disk space on temporary disk - SORT_DRIVE_INFO=($(getDriveInfo "${SORT_TEMP_DIR}")) - if [ "${#SORT_DRIVE_INFO[@]}" -gt 0 ]; then - if [ "$SIZE" -ge "${SORT_DRIVE_INFO[3]}" ]; then - warn "There may be insufficient for sort to perform an external sort using Tempo Directory ${SORT_TEMP_DIR} (${SIZE} bytes required but only ${SORT_DRIVE_INFO[3]} bytes free)" + if [ -n "${SORT_TEMP_DIR}" ]; then + SORT_DRIVE_INFO=($(getDriveInfo "${SORT_TEMP_DIR}")) + if [ "${#SORT_DRIVE_INFO[@]}" -gt 0 ]; then + if [ "$SIZE" -ge "${SORT_DRIVE_INFO[3]}" ]; then + warn "There may be insufficient for sort to perform an external sort using Temp Directory ${SORT_TEMP_DIR} (${SIZE} bytes required but only ${SORT_DRIVE_INFO[3]} bytes free)" + fi fi fi else @@ -371,26 +356,7 @@ generate_index() # Sort the input data info "Sort $IDX" debug "Sorting $DATA into work file $WORK" - if [ $HAS_PV = 0 ]; then - # Use pv (pipe viewer) to monitor sort progress - # Note that progress data will only be seen if running in the foreground - - pv -c -N data < "$DATA" | sort $SORT_ARGS -u $KEYS | pv -c -N sort -s $SIZE > $WORK - - # CAUTION - # If sort errors here then the piping through pv will stop us from seeing the error - # and we'll continue onwards - # Therefore we need to check that the output size is same as input size as this is - # 
the only way to tell if sort suceeded - local OUTPUT_SIZE=$(getSize "$WORK") - debug "Size of sorted data is $OUTPUT_SIZE bytes" - if [ $SIZE != $OUTPUT_SIZE ]; then - abort 1 "Aborting due to sort error, see preceding output for error from sort" - fi - else - # Use sort without any progress monitoring - sort $SORT_ARGS -u $KEYS < "$DATA" > $WORK - fi + sort $SORT_ARGS -u $KEYS < "$DATA" > $WORK info "Sort $IDX Completed" # Build into an index From be12606cab9b7107d877635a75dc0069f7689e30 Mon Sep 17 00:00:00 2001 From: Andy Seaborne Date: Mon, 2 Nov 2015 22:00:31 +0000 Subject: [PATCH 10/11] Fix: Spec says ENCODE_FOR_URI should work with lang tag strings. --- .../org/apache/jena/sparql/expr/nodevalue/XSDFuncOp.java | 7 +++---- .../java/org/apache/jena/sparql/expr/TestFunctions2.java | 4 +++- 2 files changed, 6 insertions(+), 5 deletions(-) diff --git a/jena-arq/src/main/java/org/apache/jena/sparql/expr/nodevalue/XSDFuncOp.java b/jena-arq/src/main/java/org/apache/jena/sparql/expr/nodevalue/XSDFuncOp.java index 57a2497ecae..6b2ea505876 100644 --- a/jena-arq/src/main/java/org/apache/jena/sparql/expr/nodevalue/XSDFuncOp.java +++ b/jena-arq/src/main/java/org/apache/jena/sparql/expr/nodevalue/XSDFuncOp.java @@ -52,6 +52,7 @@ import org.apache.jena.datatypes.xsd.XSDDateTime ; import org.apache.jena.graph.Node ; import org.apache.jena.graph.NodeFactory ; +import org.apache.jena.rdf.model.impl.Util ; import org.apache.jena.sparql.ARQInternalErrorException ; import org.apache.jena.sparql.SystemARQ ; import org.apache.jena.sparql.expr.* ; @@ -583,10 +584,8 @@ public static NodeValue strEncodeForURI(NodeValue v) { Node n = v.asNode() ; if ( !n.isLiteral() ) throw new ExprEvalException("Not a literal") ; - if ( n.getLiteralDatatype() != null ) { - if ( !n.getLiteralDatatype().equals(XSDDatatype.XSDstring) ) - throw new ExprEvalException("Not a string literal") ; - } + if ( ! Util.isSimpleString(n) && ! 
Util.isLangString(n) ) + throw new ExprEvalException("Not a string literal") ; String str = n.getLiteralLexicalForm() ; String encStr = IRILib.encodeUriComponent(str) ; diff --git a/jena-arq/src/test/java/org/apache/jena/sparql/expr/TestFunctions2.java b/jena-arq/src/test/java/org/apache/jena/sparql/expr/TestFunctions2.java index 7b8c04d4102..10e174762ab 100644 --- a/jena-arq/src/test/java/org/apache/jena/sparql/expr/TestFunctions2.java +++ b/jena-arq/src/test/java/org/apache/jena/sparql/expr/TestFunctions2.java @@ -188,9 +188,11 @@ public class TestFunctions2 extends BaseTest @Test public void encodeURI_01() { test("encode_for_uri('a:b cd/~')", "'a%3Ab%20cd%2F~'") ; } @Test public void encodeURI_02() { test("encode_for_uri('\\n')", "'%0A'") ; } @Test public void encodeURI_03() { test("encode_for_uri('\\t')", "'%09'") ; } + @Test public void encodeURI_04() { test("encode_for_uri('abc')", "'abc'") ; } + @Test public void encodeURI_05() { test("encode_for_uri('abc'@en)", "'abc'") ; } @Test(expected=ExprEvalException.class) - public void encodeURI_04() { test("encode_for_uri(1234)", "'1234'") ; } + public void encodeURI_09() { test("encode_for_uri(1234)", "'1234'") ; } /* Compatibility rules # pairs of simple literals, From b0af9b111fc6fcb7d922a668dc1ed7209bac5bc5 Mon Sep 17 00:00:00 2001 From: Osma Suominen Date: Thu, 29 Oct 2015 17:08:54 +0200 Subject: [PATCH 11/11] first implementation of ASCIIFoldingLowerCaseKeywordAnalyzer --- .../ASCIIFoldingLowerCaseKeywordAnalyzer.java | 51 ++++++++++++++++ ...dingLowerCaseKeywordAnalyzerAssembler.java | 47 +++++++++++++++ .../query/text/assembler/TextAssembler.java | 1 + .../jena/query/text/assembler/TextVocab.java | 1 + .../org/apache/jena/query/text/TS_Text.java | 1 + ...hASCIIFoldingLowerCaseKeywordAnalyzer.java | 59 +++++++++++++++++++ .../assembler/TestEntityMapAssembler.java | 24 ++++++++ 7 files changed, 184 insertions(+) create mode 100644 
jena-text/src/main/java/org/apache/jena/query/text/analyzer/ASCIIFoldingLowerCaseKeywordAnalyzer.java create mode 100644 jena-text/src/main/java/org/apache/jena/query/text/assembler/ASCIIFoldingLowerCaseKeywordAnalyzerAssembler.java create mode 100644 jena-text/src/test/java/org/apache/jena/query/text/TestDatasetWithASCIIFoldingLowerCaseKeywordAnalyzer.java diff --git a/jena-text/src/main/java/org/apache/jena/query/text/analyzer/ASCIIFoldingLowerCaseKeywordAnalyzer.java b/jena-text/src/main/java/org/apache/jena/query/text/analyzer/ASCIIFoldingLowerCaseKeywordAnalyzer.java new file mode 100644 index 00000000000..f76b3b21ca4 --- /dev/null +++ b/jena-text/src/main/java/org/apache/jena/query/text/analyzer/ASCIIFoldingLowerCaseKeywordAnalyzer.java @@ -0,0 +1,51 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.jena.query.text.analyzer ; + +import java.io.Reader ; + +import org.apache.lucene.analysis.Analyzer ; +import org.apache.lucene.analysis.core.KeywordTokenizer ; +import org.apache.lucene.analysis.core.LowerCaseFilter ; +import org.apache.lucene.analysis.miscellaneous.ASCIIFoldingFilter ; +import org.apache.lucene.util.Version ; + + +/** + * Lucene Analyzer implementation that works like KeywordAnalyzer (i.e. + * doesn't tokenize the input, keeps it as a single token), but lowercases the + * text and applies ASCII folding, and is thus case- and accent-insensitive. + */ + +public class ASCIIFoldingLowerCaseKeywordAnalyzer extends Analyzer { + private Version version; + + public ASCIIFoldingLowerCaseKeywordAnalyzer(Version ver) { + this.version = ver; + } + + @Override + protected TokenStreamComponents createComponents(String fieldName, Reader reader) { + KeywordTokenizer source = new KeywordTokenizer(reader); + LowerCaseFilter filter1 = new LowerCaseFilter(version, source); + ASCIIFoldingFilter filter2 = new ASCIIFoldingFilter(filter1); + return new TokenStreamComponents(source, filter2); + } + +} diff --git a/jena-text/src/main/java/org/apache/jena/query/text/assembler/ASCIIFoldingLowerCaseKeywordAnalyzerAssembler.java b/jena-text/src/main/java/org/apache/jena/query/text/assembler/ASCIIFoldingLowerCaseKeywordAnalyzerAssembler.java new file mode 100644 index 00000000000..cc4ed9d88a2 --- /dev/null +++ b/jena-text/src/main/java/org/apache/jena/query/text/assembler/ASCIIFoldingLowerCaseKeywordAnalyzerAssembler.java @@ -0,0 +1,47 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.jena.query.text.assembler ; + +import org.apache.jena.assembler.Assembler ; +import org.apache.jena.assembler.Mode ; +import org.apache.jena.assembler.assemblers.AssemblerBase ; +import org.apache.jena.query.text.TextIndexLucene; +import org.apache.jena.query.text.analyzer.ASCIIFoldingLowerCaseKeywordAnalyzer; +import org.apache.jena.rdf.model.Resource ; +import org.apache.lucene.analysis.Analyzer; + +/** + * Assembler to create ASCII folding lowercase keyword analyzers. + */ +public class ASCIIFoldingLowerCaseKeywordAnalyzerAssembler extends AssemblerBase { + /* + text:map ( + [ text:field "text" ; + text:predicate rdfs:label; + text:analyzer [ + a text:ASCIIFoldingLowerCaseKeywordAnalyzer ; ] + ] + . 
+ */ + + @Override + public Analyzer open(Assembler a, Resource root, Mode mode) { + return new ASCIIFoldingLowerCaseKeywordAnalyzer(TextIndexLucene.VER); + } +} diff --git a/jena-text/src/main/java/org/apache/jena/query/text/assembler/TextAssembler.java b/jena-text/src/main/java/org/apache/jena/query/text/assembler/TextAssembler.java index 021c0030713..4bcdc57019b 100644 --- a/jena-text/src/main/java/org/apache/jena/query/text/assembler/TextAssembler.java +++ b/jena-text/src/main/java/org/apache/jena/query/text/assembler/TextAssembler.java @@ -34,6 +34,7 @@ public static void init() Assembler.general.implementWith(TextVocab.simpleAnalyzer, new SimpleAnalyzerAssembler()) ; Assembler.general.implementWith(TextVocab.keywordAnalyzer, new KeywordAnalyzerAssembler()) ; Assembler.general.implementWith(TextVocab.lowerCaseKeywordAnalyzer, new LowerCaseKeywordAnalyzerAssembler()) ; + Assembler.general.implementWith(TextVocab.asciiFoldingLowerCaseKeywordAnalyzer, new ASCIIFoldingLowerCaseKeywordAnalyzerAssembler()) ; Assembler.general.implementWith(TextVocab.localizedAnalyzer, new LocalizedAnalyzerAssembler()) ; } } diff --git a/jena-text/src/main/java/org/apache/jena/query/text/assembler/TextVocab.java b/jena-text/src/main/java/org/apache/jena/query/text/assembler/TextVocab.java index fb14505e833..e108d63c3de 100644 --- a/jena-text/src/main/java/org/apache/jena/query/text/assembler/TextVocab.java +++ b/jena-text/src/main/java/org/apache/jena/query/text/assembler/TextVocab.java @@ -63,6 +63,7 @@ public class TextVocab public static final Resource simpleAnalyzer = Vocab.resource(NS, "SimpleAnalyzer"); public static final Resource keywordAnalyzer = Vocab.resource(NS, "KeywordAnalyzer"); public static final Resource lowerCaseKeywordAnalyzer = Vocab.resource(NS, "LowerCaseKeywordAnalyzer"); + public static final Resource asciiFoldingLowerCaseKeywordAnalyzer = Vocab.resource(NS, "ASCIIFoldingLowerCaseKeywordAnalyzer"); public static final Resource localizedAnalyzer = 
Vocab.resource(NS, "LocalizedAnalyzer"); } diff --git a/jena-text/src/test/java/org/apache/jena/query/text/TS_Text.java b/jena-text/src/test/java/org/apache/jena/query/text/TS_Text.java index 3459e4333a8..cfdbcd2fafb 100644 --- a/jena-text/src/test/java/org/apache/jena/query/text/TS_Text.java +++ b/jena-text/src/test/java/org/apache/jena/query/text/TS_Text.java @@ -48,6 +48,7 @@ , TestDatasetWithStandardAnalyzer.class , TestDatasetWithKeywordAnalyzer.class , TestDatasetWithLowerCaseKeywordAnalyzer.class + , TestDatasetWithASCIIFoldingLowerCaseKeywordAnalyzer.class , TestLuceneWithMultipleThreads.class , TestDatasetWithLocalizedAnalyzer.class }) diff --git a/jena-text/src/test/java/org/apache/jena/query/text/TestDatasetWithASCIIFoldingLowerCaseKeywordAnalyzer.java b/jena-text/src/test/java/org/apache/jena/query/text/TestDatasetWithASCIIFoldingLowerCaseKeywordAnalyzer.java new file mode 100644 index 00000000000..d76e80a6ff2 --- /dev/null +++ b/jena-text/src/test/java/org/apache/jena/query/text/TestDatasetWithASCIIFoldingLowerCaseKeywordAnalyzer.java @@ -0,0 +1,59 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.jena.query.text; + +import java.util.Arrays ; +import java.util.HashSet ; +import java.util.Set ; + +import org.apache.jena.atlas.lib.StrUtils ; +import org.junit.Before ; +import org.junit.Test ; + +/** + * This class defines a setup configuration for a dataset that uses an ASCII folding lowercase keyword analyzer with a Lucene index. + */ +public class TestDatasetWithASCIIFoldingLowerCaseKeywordAnalyzer extends TestDatasetWithLowerCaseKeywordAnalyzer { + @Override + @Before + public void before() { + init("text:ASCIIFoldingLowerCaseKeywordAnalyzer"); + } + + @Test + public void testASCIIFoldingLowerCaseKeywordAnalyzerIsCaseAndAccentInsensitive() { + final String testName = "testASCIIFoldingLowerCaseKeywordAnalyzerIsCaseAndAccentInsensitive"; + final String turtle = StrUtils.strjoinNL( + TURTLE_PROLOG, + "<" + RESOURCE_BASE + testName + ">", + " rdfs:label 'Feeling a déjà vu'", + "." + ); + String queryString = StrUtils.strjoinNL( + QUERY_PROLOG, + "SELECT ?s", + "WHERE {", + " ?s text:query ( rdfs:label '\"feeling ä déja\"*' 10 ) .", + "}" + ); + Set expectedURIs = new HashSet<>() ; + expectedURIs.addAll( Arrays.asList(RESOURCE_BASE + testName)) ; + doTestSearch(turtle, queryString, expectedURIs); + } +} diff --git a/jena-text/src/test/java/org/apache/jena/query/text/assembler/TestEntityMapAssembler.java b/jena-text/src/test/java/org/apache/jena/query/text/assembler/TestEntityMapAssembler.java index ab3ed299953..3e6d9b08f62 100644 --- a/jena-text/src/test/java/org/apache/jena/query/text/assembler/TestEntityMapAssembler.java +++ b/jena-text/src/test/java/org/apache/jena/query/text/assembler/TestEntityMapAssembler.java @@ -29,6 +29,7 @@ import org.apache.jena.graph.Node ; import org.apache.jena.query.text.EntityDefinition ; import org.apache.jena.query.text.TextIndexException ; +import org.apache.jena.query.text.analyzer.ASCIIFoldingLowerCaseKeywordAnalyzer ; import org.apache.jena.query.text.analyzer.LowerCaseKeywordAnalyzer ; import 
org.apache.jena.rdf.model.* ; import org.apache.jena.vocabulary.RDF ; @@ -56,6 +57,7 @@ public class TestEntityMapAssembler { private static final Resource spec4; private static final Resource spec5; private static final Resource spec6; + private static final Resource spec7; private static final Resource specNoEntityField; private static final Resource specNoDefaultField; private static final Resource specNoMapProperty; @@ -119,6 +121,12 @@ private Object getOne(EntityDefinition entityDef, String field) { assertEquals(LowerCaseKeywordAnalyzer.class, entityDef.getAnalyzer(SPEC1_DEFAULT_FIELD).getClass()); } + @Test public void EntityHasMapEntryWithASCIIFoldingLowerCaseKeywordAnalyzer() { + EntityDefinitionAssembler entDefAssem = new EntityDefinitionAssembler(); + EntityDefinition entityDef = entDefAssem.open(Assembler.general, spec7, null); + assertEquals(ASCIIFoldingLowerCaseKeywordAnalyzer.class, entityDef.getAnalyzer(SPEC1_DEFAULT_FIELD).getClass()); + } + @Test(expected=TextIndexException.class) public void errorOnNoEntityField() { EntityDefinitionAssembler entDefAssem = new EntityDefinitionAssembler(); entDefAssem.open(null, specNoEntityField, null); @@ -254,6 +262,22 @@ private Object getOne(EntityDefinition entityDef, String field) { .addProperty(RDF.type, TextVocab.lowerCaseKeywordAnalyzer)) })); + // create a simple entity map specification using an ASCII folding lowercase keyword analyzer + + spec7 = model.createResource(TESTBASE + "spec7") + .addProperty(TextVocab.pEntityField, SPEC1_ENTITY_FIELD) + .addProperty(TextVocab.pDefaultField, SPEC1_DEFAULT_FIELD) + .addProperty(TextVocab.pMap, + model.createList( + new RDFNode[] { + model.createResource() + .addProperty(TextVocab.pField, SPEC1_DEFAULT_FIELD) + .addProperty(TextVocab.pPredicate, SPEC1_PREDICATE) + .addProperty(TextVocab.pAnalyzer, + model.createResource() + .addProperty(RDF.type, TextVocab.asciiFoldingLowerCaseKeywordAnalyzer)) + })); + // bad assembler spec specNoEntityField =