From 4d130f509b18eea6136af2c8820977e0877e9733 Mon Sep 17 00:00:00 2001 From: Andy Seaborne Date: Wed, 26 Oct 2016 20:42:18 +0100 Subject: [PATCH 1/3] Clean tokenizer; support for warnings (and continue) --- .../apache/jena/riot/lang/LangNTriples.java | 4 - .../riot/system/ParserProfileChecker.java | 6 + .../jena/riot/tokens/TokenizerText.java | 146 +++++++++++------- 3 files changed, 99 insertions(+), 57 deletions(-) diff --git a/jena-arq/src/main/java/org/apache/jena/riot/lang/LangNTriples.java b/jena-arq/src/main/java/org/apache/jena/riot/lang/LangNTriples.java index 374bf07c6ad..535c3f863cc 100644 --- a/jena-arq/src/main/java/org/apache/jena/riot/lang/LangNTriples.java +++ b/jena-arq/src/main/java/org/apache/jena/riot/lang/LangNTriples.java @@ -84,10 +84,6 @@ protected final Triple parseOne() if ( x.getType() != TokenType.DOT ) exception(x, "Triple not terminated by DOT: %s", x) ; -// Node s = X ; -// Node p = X ; -// Node o = X ; -// return T ; Node s = tokenAsNode(sToken) ; Node p = tokenAsNode(pToken) ; diff --git a/jena-arq/src/main/java/org/apache/jena/riot/system/ParserProfileChecker.java b/jena-arq/src/main/java/org/apache/jena/riot/system/ParserProfileChecker.java index aa336490769..a748aa8c2f4 100644 --- a/jena-arq/src/main/java/org/apache/jena/riot/system/ParserProfileChecker.java +++ b/jena-arq/src/main/java/org/apache/jena/riot/system/ParserProfileChecker.java @@ -61,6 +61,12 @@ public String resolveIRI(String uriStr, long line, long col) { public IRI makeIRI(String uriStr, long line, long col) { // resolves, but we handle the errors and warnings. IRI iri = prologue.getResolver().resolveSilent(uriStr) ; + if ( uriStr.contains(" ") ) { + // Specific check for spaces. + errorHandler.warning("Bad IRI: <"+uriStr+"> Spaces are not legal in URIs/IRIs.", line, col); + return iri ; + } + // At this point, IRI "errors" are warnings. 
CheckerIRI.iriViolations(iri, errorHandler, line, col) ; return iri ; } diff --git a/jena-arq/src/main/java/org/apache/jena/riot/tokens/TokenizerText.java b/jena-arq/src/main/java/org/apache/jena/riot/tokens/TokenizerText.java index 6537d061e06..eb6e7075b67 100644 --- a/jena-arq/src/main/java/org/apache/jena/riot/tokens/TokenizerText.java +++ b/jena-arq/src/main/java/org/apache/jena/riot/tokens/TokenizerText.java @@ -18,8 +18,8 @@ package org.apache.jena.riot.tokens; -import static org.apache.jena.atlas.lib.Chars.* ; -import static org.apache.jena.riot.system.RiotChars.* ; +import static org.apache.jena.atlas.lib.Chars.*; +import static org.apache.jena.riot.system.RiotChars.*; import java.util.NoSuchElementException ; @@ -28,6 +28,7 @@ import org.apache.jena.atlas.io.PeekReader ; import org.apache.jena.atlas.lib.Chars ; import org.apache.jena.riot.RiotParseException ; +import org.apache.jena.riot.system.ErrorHandler; import org.apache.jena.riot.system.RiotChars ; import org.apache.jena.sparql.ARQInternalErrorException ; @@ -60,6 +61,26 @@ public final class TokenizerText implements Tokenizer private boolean finished = false ; private TokenChecker checker = null ; + // The code assumes that errors throw exception and so stop parsing. + private ErrorHandler errorHandler = new ErrorHandler() { + @Override + public void warning(String message, long line, long col) { + // Warning/continue. 
+ //ErrorHandlerFactory.errorHandlerStd.warning(message, line, col); + throw new RiotParseException(message, line, col) ; + } + + @Override + public void error(String message, long line, long col) { + throw new RiotParseException(message, line, col) ; + } + + @Override + public void fatal(String message, long line, long col) { + throw new RiotParseException(message, line, col) ; + } + } ; + /*package*/ TokenizerText(PeekReader reader) { this(reader, false) ; } @@ -124,11 +145,25 @@ public final Token peek() { } @Override - public void remove() { throw new UnsupportedOperationException() ; } + public void remove() + { throw new UnsupportedOperationException() ; } - public TokenChecker getChecker() { return checker ; } - public void setChecker(TokenChecker checker) { this.checker = checker ; } + public TokenChecker getChecker() { + return checker; + } + + public void setChecker(TokenChecker checker) { + this.checker = checker; + } + + public ErrorHandler getErrorHandler() { + return errorHandler; + } + public void setErrorHandler(ErrorHandler handler) { + this.errorHandler = handler; + } + @Override public void close() { IO.close(reader) ; @@ -243,7 +278,7 @@ private Token parseToken() { Token subToken = parseToken() ; if ( !subToken.isIRI() ) - exception("Datatype URI required after ^^ - URI or prefixed name expected") ; + error("Datatype URI required after ^^ - URI or prefixed name expected") ; mainToken.setSubToken2(subToken) ; mainToken.setType(TokenType.LITERAL_DT) ; @@ -275,7 +310,7 @@ private Token parseToken() { token.setType(TokenType.CNTRL) ; ch = reader.readChar() ; if ( ch == EOF ) - exception("EOF found after " + CTRL_CHAR) ; + error("EOF found after " + CTRL_CHAR) ; if ( RiotChars.isWhitespace(ch) ) token.cntrlCode = -1 ; else @@ -428,6 +463,8 @@ private Token parseToken() { private static final boolean VeryVeryLaxIRI = false ; + // Spaces in IRI are illegal. 
+ private static final boolean AllowSpacesInIRI = false ; // [8] IRIREF ::= '<' ([^#x00-#x20<>"{}|^`\] | UCHAR)* '>' private String readIRI() { @@ -436,11 +473,11 @@ private String readIRI() { int ch = reader.readChar() ; switch(ch) { case EOF: - exception("Broken IRI (End of file)") ; + error("Broken IRI (End of file)") ; case NL: - exception("Broken IRI (newline): %s", stringBuilder.toString()) ; + error("Broken IRI (newline): %s", stringBuilder.toString()) ; case CR: - exception("Broken IRI (CR): %s", stringBuilder.toString()) ; + error("Broken IRI (CR): %s", stringBuilder.toString()) ; case CH_GT: // Done! return stringBuilder.toString() ; @@ -457,17 +494,20 @@ private String readIRI() { break ; case CH_LT: // Probably a corrupt file so not a warning. - exception("Bad character in IRI (bad character: '<'): <%s<...>", stringBuilder.toString()) ; + error("Bad character in IRI (bad character: '<'): <%s[<]...>", stringBuilder.toString()) ; case TAB: - exception("Bad character in IRI (Tab character): <%s[tab]...>", stringBuilder.toString()) ; - case SPC: - warning("Bad character in IRI (space): <%s[space]...>", stringBuilder.toString()) ; + error("Bad character in IRI (Tab character): <%s[tab]...>", stringBuilder.toString()) ; case '{': case '}': case '"': case '|': case '^': case '`' : if ( ! VeryVeryLaxIRI ) warning("Illegal character in IRI (codepoint 0x%02X, '%c'): <%s[%c]...>", ch, (char)ch, stringBuilder.toString(), (char)ch) ; + break ; + case SPC: + if ( ! 
AllowSpacesInIRI ) + warning("Bad character in IRI (space): <%s[space]...>", stringBuilder.toString()) ; + break ; default: if ( ch <= 0x19 ) - warning("Illegal character in IRI (control char 0x%02X): %s", ch, stringBuilder.toString()) ; + warning("Illegal character in IRI (control char 0x%02X): <%s[0x%02X]...>", ch, stringBuilder.toString(), ch) ; } insertCodepoint(stringBuilder, ch) ; } @@ -477,13 +517,13 @@ private String readIRI() { private final int readUnicodeEscape() { int ch = reader.readChar() ; if ( ch == EOF ) - exception("Broken escape sequence") ; + error("Broken escape sequence") ; switch (ch) { case 'u': return readUnicode4Escape(); case 'U': return readUnicode8Escape(); default: - exception("Illegal unicode escape sequence value: \\%c (0x%02X)", ch, ch); + error("Illegal unicode escape sequence value: \\%c (0x%02X)", ch, ch); } return 0 ; } @@ -506,7 +546,7 @@ private void readPrefixedNameOrKeyword(Token token) { // If we made no progress, nothing found, not even a keyword -- it's an // error. if ( posn == reader.getPosition() ) - exception("Failed to find a prefix name or keyword: %c(%d;0x%04X)", ch, ch, ch) ; + error("Failed to find a prefix name or keyword: %c(%d;0x%04X)", ch, ch, ch) ; if ( Checking ) checkKeyword(token.getImage()) ; @@ -629,13 +669,13 @@ private void processPLX(int ch) ch = reader.peekChar() ; if ( ! isHexChar(ch) ) - exception("Not a hex charcater: '%c'",ch) ; + error("Not a hex charcater: '%c'",ch) ; stringBuilder.append((char)ch) ; reader.readChar() ; ch = reader.peekChar() ; if ( ! 
isHexChar(ch) ) - exception("Not a hex charcater: '%c'",ch) ; + error("Not a hex charcater: '%c'",ch) ; stringBuilder.append((char)ch) ; reader.readChar() ; } @@ -661,11 +701,11 @@ private String readString(int startCh, int endCh) { int ch = reader.readChar() ; if ( ch == EOF ) { // if ( endNL ) return stringBuilder.toString() ; - exception("Broken token: " + stringBuilder.toString(), y, x) ; + error("Broken token: " + stringBuilder.toString(), y, x) ; } if ( ch == NL ) - exception("Broken token (newline): " + stringBuilder.toString(), y, x) ; + error("Broken token (newline): " + stringBuilder.toString(), y, x) ; if ( ch == endCh ) { return stringBuilder.toString() ; @@ -684,7 +724,7 @@ private String readLongString(int quoteChar, boolean endNL) { if ( ch == EOF ) { if ( endNL ) return stringBuilder.toString() ; - exception("Broken long string") ; + error("Broken long string") ; } if ( ch == quoteChar ) { @@ -771,14 +811,14 @@ private String readBlankNodeLabel() { { int ch = reader.peekChar() ; if ( ch == EOF ) - exception("Blank node label missing (EOF found)") ; + error("Blank node label missing (EOF found)") ; if ( isWhitespace(ch) ) - exception("Blank node label missing") ; + error("Blank node label missing") ; // if ( ! isAlpha(ch) && ch != '_' ) // Not strict if ( !RiotChars.isPNChars_U_N(ch) ) - exception("Blank node label does not start with alphabetic or _ :" + (char)ch) ; + error("Blank node label does not start with alphabetic or _ :" + (char)ch) ; reader.readChar() ; stringBuilder.append((char)ch) ; } @@ -879,7 +919,7 @@ private void readNumber() { if ( x == 0 && !isDecimal ) // Possible a tokenizer error - should not have entered readNumber // in the first place. 
- exception("Unrecognized as number") ; + error("Unrecognized as number") ; if ( exponent(stringBuilder) ) { isDouble = true ; @@ -905,7 +945,7 @@ else if ( isDecimal ) token.setType(TokenType.INTEGER) ; } - private static void readHex(PeekReader reader, StringBuilder sb) { + private void readHex(PeekReader reader, StringBuilder sb) { // Just after the 0x, which are in sb int x = 0 ; for (;;) { @@ -918,7 +958,7 @@ private static void readHex(PeekReader reader, StringBuilder sb) { x++ ; } if ( x == 0 ) - exception(reader, "No hex characters after " + sb.toString()) ; + error("No hex characters after " + sb.toString()) ; } private int readDigits(StringBuilder buffer) { @@ -976,7 +1016,7 @@ private boolean exponent(StringBuilder sb) { readPossibleSign(sb) ; int x = readDigits(sb) ; if ( x == 0 ) - exception("Malformed double: " + sb) ; + error("Malformed double: " + sb) ; return true ; } @@ -984,7 +1024,7 @@ private String langTag() { stringBuilder.setLength(0) ; a2z(stringBuilder) ; if ( stringBuilder.length() == 0 ) - exception("Bad language tag") ; + error("Bad language tag") ; for (;;) { int ch = reader.peekChar() ; if ( ch == '-' ) { @@ -993,7 +1033,7 @@ private String langTag() { int x = stringBuilder.length() ; a2zN(stringBuilder) ; if ( stringBuilder.length() == x ) - exception("Bad language tag") ; + error("Bad language tag") ; } else break ; } @@ -1030,7 +1070,7 @@ private void insertCodepoint(StringBuilder buffer, int ch) { // Convert to UTF-16. Note that the rest of any system this is used // in must also respect codepoints and surrogate pairs. 
if ( !Character.isDefined(ch) && !Character.isSupplementaryCodePoint(ch) ) - exception("Illegal codepoint: 0x%04X", ch) ; + error("Illegal codepoint: 0x%04X", ch) ; char[] chars = Character.toChars(ch) ; buffer.append(chars) ; } @@ -1108,7 +1148,7 @@ private void checkControl(int code) { private final int readLiteralEscape() { int c = reader.readChar() ; if ( c == EOF ) - exception("Escape sequence not completed") ; + error("Escape sequence not completed") ; switch (c) { case 'n': return NL ; @@ -1122,7 +1162,7 @@ private final int readLiteralEscape() { case 'u': return readUnicode4Escape(); case 'U': return readUnicode8Escape(); default: - exception("Illegal escape sequence value: %c (0x%02X)", c, c); + error("Illegal escape sequence value: %c (0x%02X)", c, c); return 0 ; } } @@ -1134,7 +1174,7 @@ private final int readCharEscape() { int c = reader.readChar() ; if ( c == EOF ) - exception("Escape sequence not completed") ; + error("Escape sequence not completed") ; switch (c) { case '_': case '~': case '.': case '-': case '!': case '$': case '&': @@ -1143,7 +1183,7 @@ private final int readCharEscape() { case '=': case '/': case '?': case '#': case '@': case '%': return c ; default: - exception("illegal character escape value: \\%c", c); + error("illegal character escape value: \\%c", c); return 0 ; } } @@ -1154,7 +1194,7 @@ private final int readCharEscape() { private final int readUnicode8Escape() { int ch8 = readHexSequence(8) ; if ( ch8 > Character.MAX_CODE_POINT ) - exception("Illegal code point in \\U sequence value: 0x%08X", ch8) ; + error("Illegal code point in \\U sequence value: 0x%08X", ch8) ; return ch8 ; } @@ -1172,12 +1212,12 @@ private final int readHexSequence(int N) { private final int readHexChar() { int ch = reader.readChar() ; if ( ch == EOF ) - exception("Not a hexadecimal character (end of file)") ; + error("Not a hexadecimal character (end of file)") ; int x = valHexChar(ch) ; if ( x != -1 ) return x ; - exception("Not a hexadecimal 
character: " + (char)ch) ; + error("Not a hexadecimal character: " + (char)ch) ; return -1 ; } @@ -1185,13 +1225,13 @@ private boolean expect(String str) { for (int i = 0; i < str.length(); i++) { char want = str.charAt(i) ; if ( reader.eof() ) { - exception("End of input during expected string: " + str) ; + error("End of input during expected string: " + str) ; return false ; } int inChar = reader.peekChar() ; if ( inChar != want ) { // System.err.println("N-triple reader error"); - exception("expected \"" + str + "\"") ; + error("expected \"" + str + "\"") ; return false ; } reader.readChar() ; @@ -1200,18 +1240,18 @@ private boolean expect(String str) { } private void warning(String message, Object... args) { - exception(message, args); + String msg = String.format(message, args) ; + errorHandler.warning(msg, reader.getLineNum(), reader.getColNum()) ; + //exception(message, args); } - private void exception(String message, Object... args) { - exception$(message, reader.getLineNum(), reader.getColNum(), args) ; - } - - private static void exception(PeekReader reader, String message, Object... args) { - exception$(message, reader.getLineNum(), reader.getColNum(), args) ; - } - - private static void exception$(String message, long line, long col, Object... args) { - throw new RiotParseException(String.format(message, args), line, col) ; + private void error(String message, Object... args) { + String msg = String.format(message, args) ; + long line = reader.getLineNum() ; + long col = reader.getColNum() ; + errorHandler.error(msg, line, col) ; + // We require that errors cause the tokenizer to stop so in case the + // provided error handler does not, we throw an exception. 
+ throw new RiotParseException(msg, line, col) ; } } From 6b932a567cd829fc482d11139c7b6615c33bb48a Mon Sep 17 00:00:00 2001 From: Andy Seaborne Date: Thu, 27 Oct 2016 10:21:30 +0100 Subject: [PATCH 2/3] Reformat --- .../apache/jena/riot/lang/LangNTriples.java | 21 +++------ .../org/apache/jena/riot/lang/LangNTuple.java | 47 ++++++++----------- 2 files changed, 27 insertions(+), 41 deletions(-) diff --git a/jena-arq/src/main/java/org/apache/jena/riot/lang/LangNTriples.java b/jena-arq/src/main/java/org/apache/jena/riot/lang/LangNTriples.java index 535c3f863cc..37421295f6d 100644 --- a/jena-arq/src/main/java/org/apache/jena/riot/lang/LangNTriples.java +++ b/jena-arq/src/main/java/org/apache/jena/riot/lang/LangNTriples.java @@ -39,10 +39,7 @@ public final class LangNTriples extends LangNTuple { private static Logger messageLog = LoggerFactory.getLogger("N-Triples") ; - public LangNTriples(Tokenizer tokens, - ParserProfile profile, - StreamRDF dest) - { + public LangNTriples(Tokenizer tokens, ParserProfile profile, StreamRDF dest) { super(tokens, profile, dest) ; } @@ -51,19 +48,16 @@ public LangNTriples(Tokenizer tokens, /** Method to parse the whole stream of triples, sending each to the sink */ @Override - protected final void runParser() - { - while(hasNext()) - { - Triple x = parseOne() ; + protected final void runParser() { + while (hasNext()) { + Triple x = parseOne(); if ( x != null ) - dest.triple(x) ; + dest.triple(x); } } @Override - protected final Triple parseOne() - { + protected final Triple parseOne() { Token sToken = nextToken() ; if ( sToken.isEOF() ) exception(sToken, "Premature end of file: %s", sToken) ; @@ -92,8 +86,7 @@ protected final Triple parseOne() } @Override - protected final Node tokenAsNode(Token token) - { + protected final Node tokenAsNode(Token token) { return profile.create(null, token) ; } } diff --git a/jena-arq/src/main/java/org/apache/jena/riot/lang/LangNTuple.java 
b/jena-arq/src/main/java/org/apache/jena/riot/lang/LangNTuple.java index 479e0aa022b..aed16d380ae 100644 --- a/jena-arq/src/main/java/org/apache/jena/riot/lang/LangNTuple.java +++ b/jena-arq/src/main/java/org/apache/jena/riot/lang/LangNTuple.java @@ -53,24 +53,19 @@ public abstract class LangNTuple extends LangBase implements Iterator protected boolean skipOnBadTerm = false ; - protected LangNTuple(Tokenizer tokens, - ParserProfile profile, - StreamRDF dest) - { - super(tokens, profile, dest) ; + protected LangNTuple(Tokenizer tokens, ParserProfile profile, StreamRDF dest) { + super(tokens, profile, dest); } // Assumes no syntax errors. @Override - public final boolean hasNext() - { - return super.moreTokens() ; + public final boolean hasNext() { + return super.moreTokens(); } @Override - public final X next() - { - return parseOne() ; + public final X next() { + return parseOne(); } @Override @@ -81,30 +76,28 @@ public final void remove() protected abstract X parseOne() ; /** Note a tuple not being output */ - protected void skipOne(X object, String printForm, long line, long col) - { - profile.getHandler().warning("Skip: "+printForm, line, col) ; + protected void skipOne(X object, String printForm, long line, long col) { + profile.getHandler().warning("Skip: " + printForm, line, col); } protected abstract Node tokenAsNode(Token token) ; - protected final void checkIRIOrBNode(Token token) - { - if ( token.hasType(TokenType.IRI) ) return ; - if ( token.hasType(TokenType.BNODE) ) return ; - exception(token, "Expected BNode or IRI: Got: %s", token) ; + protected final void checkIRIOrBNode(Token token) { + if ( token.hasType(TokenType.IRI) ) + return; + if ( token.hasType(TokenType.BNODE) ) + return; + exception(token, "Expected BNode or IRI: Got: %s", token); } - protected final void checkIRI(Token token) - { - if ( token.hasType(TokenType.IRI) ) return ; - exception(token, "Expected IRI: Got: %s", token) ; + protected final void checkIRI(Token token) { + if ( 
token.hasType(TokenType.IRI) ) + return; + exception(token, "Expected IRI: Got: %s", token); } - protected final void checkRDFTerm(Token token) - { - switch(token.getType()) - { + protected final void checkRDFTerm(Token token) { + switch (token.getType()) { case IRI: case BNODE: case STRING2: From e11f1850f6c7c3caad37cfbfd66ae980242a2d71 Mon Sep 17 00:00:00 2001 From: Andy Seaborne Date: Thu, 27 Oct 2016 22:49:04 +0100 Subject: [PATCH 3/3] static default error handler --- .../apache/jena/riot/tokens/TokenizerText.java | 17 ++++++++--------- 1 file changed, 8 insertions(+), 9 deletions(-) diff --git a/jena-arq/src/main/java/org/apache/jena/riot/tokens/TokenizerText.java b/jena-arq/src/main/java/org/apache/jena/riot/tokens/TokenizerText.java index eb6e7075b67..af89d05cf14 100644 --- a/jena-arq/src/main/java/org/apache/jena/riot/tokens/TokenizerText.java +++ b/jena-arq/src/main/java/org/apache/jena/riot/tokens/TokenizerText.java @@ -61,31 +61,30 @@ public final class TokenizerText implements Tokenizer private boolean finished = false ; private TokenChecker checker = null ; - // The code assumes that errors throw exception and so stop parsing. - private ErrorHandler errorHandler = new ErrorHandler() { - @Override - public void warning(String message, long line, long col) { + private static class ErrorHandlerTokenizer implements ErrorHandler { + @Override public void warning(String message, long line, long col) { // Warning/continue. 
//ErrorHandlerFactory.errorHandlerStd.warning(message, line, col); throw new RiotParseException(message, line, col) ; } - @Override - public void error(String message, long line, long col) { + @Override public void error(String message, long line, long col) { throw new RiotParseException(message, line, col) ; } - @Override - public void fatal(String message, long line, long col) { + @Override public void fatal(String message, long line, long col) { throw new RiotParseException(message, line, col) ; } } ; + // The code assumes that errors throw exception and so stop parsing. + private static final ErrorHandler defaultErrorHandler = new ErrorHandlerTokenizer() ; + private ErrorHandler errorHandler = defaultErrorHandler ; /*package*/ TokenizerText(PeekReader reader) { this(reader, false) ; } - /* package */TokenizerText(PeekReader reader, boolean lineMode) { + /*package*/ TokenizerText(PeekReader reader, boolean lineMode) { this.reader = reader ; this.lineMode = lineMode ; }