From 4e3011a4d80545f04563f427687f4fa74e17103f Mon Sep 17 00:00:00 2001 From: Hans Date: Wed, 1 Aug 2018 16:06:55 -0500 Subject: [PATCH 1/3] ANY23-380 disallow duplicate attribute keys --- .../any23/extractor/rdf/BaseRDFExtractor.java | 38 +- .../extractor/rdfa/RDFa11ExtractorTest.java | 6 + .../rdfa/attribute-already-specified.html | 567 ++++++++++++++++++ 3 files changed, 601 insertions(+), 10 deletions(-) create mode 100644 test-resources/src/test/resources/html/rdfa/attribute-already-specified.html diff --git a/core/src/main/java/org/apache/any23/extractor/rdf/BaseRDFExtractor.java b/core/src/main/java/org/apache/any23/extractor/rdf/BaseRDFExtractor.java index 84c53c7ea..9e2441265 100644 --- a/core/src/main/java/org/apache/any23/extractor/rdf/BaseRDFExtractor.java +++ b/core/src/main/java/org/apache/any23/extractor/rdf/BaseRDFExtractor.java @@ -46,6 +46,7 @@ import java.io.PushbackInputStream; import java.nio.charset.Charset; import java.nio.charset.StandardCharsets; +import java.util.HashSet; import java.util.Iterator; import java.util.regex.Pattern; @@ -131,26 +132,43 @@ public void run( // See https://issues.apache.org/jira/browse/ANY23-317 // and https://issues.apache.org/jira/browse/ANY23-340 NodeTraversor.filter(new NodeFilter() { + final HashSet tmpAttributeKeys = new HashSet<>(); + @Override public FilterResult head(Node node, int depth) { if (node instanceof Element) { + HashSet attributeKeys = tmpAttributeKeys; for (Iterator it = node.attributes().iterator(); it.hasNext(); ) { // fix for ANY23-350: valid xml attribute names are ^[a-zA-Z_:][-a-zA-Z0-9_:.] Attribute attr = it.next(); - String key = attr.getKey().replaceAll("[^-a-zA-Z0-9_:.]", ""); - - // fix for ANY23-347: strip xml namespaces - int prefixlen = key.lastIndexOf(':') + 1; - String prefix = key.substring(0, prefixlen).toLowerCase(); - key = (prefix.equals("xmlns:") || prefix.equals("xml:") ? prefix : "") - + key.substring(prefixlen); - - if (key.matches("[a-zA-Z_:][-a-zA-Z0-9_:.]*")) { - attr.setKey(key); + String oldKey = attr.getKey(); + String newKey = oldKey.replaceAll("[^-a-zA-Z0-9_:.]", ""); + + // fix for ANY23-347: strip non-reserved xml namespaces + // See https://www.w3.org/TR/xml-names/#sec-namespaces + // "All other prefixes beginning with the three-letter sequence x, m, l, + // in any case combination, are reserved. This means that: + // * users SHOULD NOT use them except as defined by later specifications + // * processors MUST NOT treat them as fatal errors." + int prefixlen = oldKey.lastIndexOf(':') + 1; + String prefix = newKey.substring(0, prefixlen).toLowerCase(); + newKey = (prefix.startsWith("xml") ? prefix : "") + newKey.substring(prefixlen); + + if (newKey.matches("[a-zA-Z_:][-a-zA-Z0-9_:.]*") + //the namespace name for "xmlns" MUST NOT be declared + //the namespace name for "xml" need not be declared + && !newKey.startsWith("xmlns:xml") + // fix for ANY23-380: disallow duplicate attribute keys + && attributeKeys.add(newKey)) { + //avoid indexOf() operation if possible + if (!newKey.equals(oldKey)) { + attr.setKey(newKey); + } } else { it.remove(); } } + attributeKeys.clear(); String tagName = ((Element)node).tagName().replaceAll("[^-a-zA-Z0-9_:.]", ""); tagName = tagName.substring(tagName.lastIndexOf(':') + 1); diff --git a/core/src/test/java/org/apache/any23/extractor/rdfa/RDFa11ExtractorTest.java b/core/src/test/java/org/apache/any23/extractor/rdfa/RDFa11ExtractorTest.java index f504dc54d..c3d7fd528 100644 --- a/core/src/test/java/org/apache/any23/extractor/rdfa/RDFa11ExtractorTest.java +++ b/core/src/test/java/org/apache/any23/extractor/rdfa/RDFa11ExtractorTest.java @@ -76,6 +76,12 @@ public void testInvalidXMLCharacter() { assertModelNotEmpty(); } + @Test + public void testAttributeAlreadySpecified() { + assertExtract("/html/rdfa/attribute-already-specified.html"); + assertModelNotEmpty(); + } + @Test public void test0087() { assertExtract("/html/rdfa/0087.xhtml"); diff --git a/test-resources/src/test/resources/html/rdfa/attribute-already-specified.html b/test-resources/src/test/resources/html/rdfa/attribute-already-specified.html new file mode 100644 index 000000000..509eeba39 --- /dev/null +++ b/test-resources/src/test/resources/html/rdfa/attribute-already-specified.html @@ -0,0 +1,567 @@ + + + + + + Bilder kirche: 554 gut bewertete Fotos der Lokalkompass Bürgerreporter + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+
+
+ +
+
+ +
+
+
+ + + + + +
+
+
+ + +
+ +
+ +
+ +
+ + + +
+
+ +

Gut bewertete Bilder zum Thema kirche

+ +
+
+ +
+
+ + + + 28
+
+ +
+
+ +
+
+ + + + 27
+
+ +
+
+ +
+
+ + + + 25
+
+ +
+
+ +
+
+ + + + 22
+
+ +
+
+ +
+
+ + + + 22
+
+ +
+
+ +
+
+ + + + 21
+
+ +
+
+ +
+
+ + + + 21
+
+ +
+
+ +
+
+ + + + 20
+
+ +
+
+ +
+
+ + + + 20
+
+ +
+
+ +
+
+ + + + 19
+
+ +
+
+ +
+
+ + + + 18
+
+ +
+
+ +
+
+ + + + 18
+
+ +
+
+ + +
+
+ + +
+
+ +
+ + + + + + + + + + + + + + + + + + + + + From 159aeb489473f600213142a746d39a49e3d3548b Mon Sep 17 00:00:00 2001 From: Hans Date: Thu, 2 Aug 2018 12:46:44 -0500 Subject: [PATCH 2/3] cleaned up annoying logging/console output --- .../java/org/apache/any23/cli/BaseTool.java | 83 +++++++++++++++++++ .../any23/cli/ExtractorDocumentation.java | 44 ++++++---- .../org/apache/any23/cli/MicrodataParser.java | 17 +++- .../org/apache/any23/cli/MimeDetector.java | 17 +++- .../org/apache/any23/cli/PluginVerifier.java | 18 +++- .../main/java/org/apache/any23/cli/Rover.java | 12 ++- .../java/org/apache/any23/cli/ToolRunner.java | 13 ++- .../org/apache/any23/cli/VocabPrinter.java | 18 +++- .../apache/any23/cli/MicrodataParserTest.java | 2 +- .../apache/any23/cli/MimeDetectorTest.java | 4 +- .../java/org/apache/any23/cli/RoverTest.java | 2 - .../org/apache/any23/cli/SimpleRoverTest.java | 38 +++++---- .../org/apache/any23/cli/ToolTestBase.java | 7 +- .../org/apache/any23/cli/YAMLRoverTest.java | 9 +- .../extractor/SingleDocumentExtraction.java | 4 +- .../html/AbstractExtractorTestCase.java | 11 ++- .../extractor/rdfa/RDFa11ExtractorTest.java | 4 +- .../any23/vocab/RDFSchemaUtilsTest.java | 12 +-- .../officescraper/ExcelExtractorTest.java | 2 +- 19 files changed, 245 insertions(+), 72 deletions(-) create mode 100644 cli/src/main/java/org/apache/any23/cli/BaseTool.java diff --git a/cli/src/main/java/org/apache/any23/cli/BaseTool.java b/cli/src/main/java/org/apache/any23/cli/BaseTool.java new file mode 100644 index 000000000..d9a82f584 --- /dev/null +++ b/cli/src/main/java/org/apache/any23/cli/BaseTool.java @@ -0,0 +1,83 @@ +package org.apache.any23.cli; + +import java.io.IOException; +import java.io.OutputStream; +import java.io.PrintStream; + +abstract class BaseTool implements Tool { + + abstract PrintStream getOut(); + abstract void setOut(PrintStream out); + + void run(boolean concise) throws Exception { + PrintStream out = concise(getOut(), concise); + setOut(out); + try { + run(); + } finally { + close(out); + } + } + + private static void close(PrintStream stream) { + if (stream != null && stream != System.out && stream != System.err) { + try { + stream.close(); + } catch (Throwable th) { + //ignore + } + } + } + + private static PrintStream concise(PrintStream out, boolean concise) { + return (concise && (out == System.out || out == System.err)) ? new ConcisePrintStream(out) + : (out instanceof ConcisePrintStream ? ((ConcisePrintStream) out).out : out); + } + + private static final class ConcisePrintStream extends PrintStream { + + private PrintStream out; + + private ConcisePrintStream(PrintStream out) { + super(new OutputStream() { + StringBuilder sb = new StringBuilder(); + int lineCount; + boolean truncated = false; + @Override + public void write(int b) throws IOException { + if (sb == null) { + throw new IOException("stream closed"); + } + if (b == '\n') { + lineCount++; + } + if (lineCount == 0 && sb.length() < 200) { + sb.append((char)b); + } else if (!Character.isWhitespace(b)) { + truncated = true; + } + } + + @Override + public void close() { + if (sb == null) { + return; + } + if (truncated) { + sb.append("..."); + } + if (lineCount > 1) { + sb.append("\n...\n[Suppressed ").append(lineCount).append(" lines of output.]"); + } + + out.println(sb); + sb = null; + BaseTool.close(out); + } + }, true); + this.out = out; + } + + } + +} diff --git a/cli/src/main/java/org/apache/any23/cli/ExtractorDocumentation.java b/cli/src/main/java/org/apache/any23/cli/ExtractorDocumentation.java index f0fbeeafb..d531c2dc3 100644 --- a/cli/src/main/java/org/apache/any23/cli/ExtractorDocumentation.java +++ b/cli/src/main/java/org/apache/any23/cli/ExtractorDocumentation.java @@ -30,13 +30,14 @@ import org.apache.any23.extractor.ExtractorRegistry; import java.io.IOException; +import java.io.PrintStream; /** * This class provides some command-line documentation * about available extractors and their usage. */ @Parameters( commandNames = { "extractor" }, commandDescription= "Utility for obtaining documentation about metadata extractors.") -public class ExtractorDocumentation implements Tool { +public class ExtractorDocumentation extends BaseTool { @Parameter( names = { "-l", "--list" }, description = "shows the names of all available extractors" ) private boolean showList; @@ -50,6 +51,19 @@ public class ExtractorDocumentation implements Tool { @Parameter( names = { "-a", "--all" }, description = "shows a report about all available extractors" ) private boolean showAll; + private PrintStream out = System.out; + + @Override + PrintStream getOut() { + return out; + } + + @Override + void setOut(PrintStream out) { + this.out = out; + } + + @Override public void run() throws Exception { if (showList) { printExtractorList(ExtractorRegistryImpl.getInstance()); @@ -78,7 +92,7 @@ public void printError(String msg) { */ public void printExtractorList(ExtractorRegistry registry) { for (ExtractorFactory factory : registry.getExtractorGroup()) { - System.out.println( String.format("%25s [%15s]", factory.getExtractorName(), factory.getExtractorLabel())); + out.println(String.format("%25s [%15s]", factory.getExtractorName(), factory.getExtractorLabel())); } } @@ -97,7 +111,7 @@ public void printExampleInput(String extractorName, ExtractorRegistry registry) if (input == null) { throw new IllegalArgumentException("Extractor " + extractorName + " provides no example input"); } - System.out.println(input); + out.println(input); } /** @@ -116,7 +130,7 @@ public void printExampleOutput(String extractorName, ExtractorRegistry registry) if (output == null) { throw new IllegalArgumentException("Extractor " + extractorName + " provides no example output"); } - System.out.println(output); + out.println(output); } /** @@ -131,21 +145,21 @@ public void printReport(ExtractorRegistry registry) throws IOException, Extracti for (String extractorName : registry.getAllNames()) { ExtractorFactory factory = registry.getFactory(extractorName); ExampleInputOutput example = new ExampleInputOutput(factory); - System.out.println("Extractor: " + extractorName); - System.out.println("\ttype: " + getType(factory)); - System.out.println(); + out.println("Extractor: " + extractorName); + out.println("\ttype: " + getType(factory)); + out.println(); final String exampleInput = example.getExampleInput(); - if(exampleInput == null) { - System.out.println("(No Example Available)"); + if (exampleInput == null) { + out.println("(No Example Available)"); } else { - System.out.println("-------- Example Input --------"); - System.out.println(exampleInput); - System.out.println("-------- Example Output --------"); + out.println("-------- Example Input --------"); + out.println(exampleInput); + out.println("-------- Example Output --------"); String output = example.getExampleOutput(); - System.out.println(output == null || output.trim().length() == 0 ? "(No Output Generated)" : output); + out.println(output == null || output.trim().length() == 0 ? "(No Output Generated)" : output); } - System.out.println("================================"); - System.out.println(); + out.println("================================"); + out.println(); } } diff --git a/cli/src/main/java/org/apache/any23/cli/MicrodataParser.java b/cli/src/main/java/org/apache/any23/cli/MicrodataParser.java index 19c59bf32..8655c5a95 100644 --- a/cli/src/main/java/org/apache/any23/cli/MicrodataParser.java +++ b/cli/src/main/java/org/apache/any23/cli/MicrodataParser.java @@ -30,6 +30,7 @@ import java.io.File; import java.io.InputStream; +import java.io.PrintStream; import java.net.URISyntaxException; import java.util.LinkedList; import java.util.List; @@ -44,7 +45,7 @@ * @author Michele Mostarda (mostarda@fbk.eu) */ @Parameters( commandNames = { "microdata" }, commandDescription = "Commandline Tool for extracting Microdata from file/HTTP source.") -public class MicrodataParser implements Tool { +public class MicrodataParser extends BaseTool { private static final Pattern HTTP_DOCUMENT_PATTERN = Pattern.compile("^https?://.*"); @@ -57,6 +58,18 @@ public class MicrodataParser implements Tool { ) private List document = new LinkedList(); + private PrintStream out = System.out; + + @Override + PrintStream getOut() { + return out; + } + + @Override + void setOut(PrintStream out) { + this.out = out; + } + public void run() throws Exception { if (document.isEmpty()) { throw new IllegalArgumentException("No input document URL specified"); @@ -69,7 +82,7 @@ public void run() throws Exception { documentInputInputStream, documentSource.getDocumentIRI() ); - org.apache.any23.extractor.microdata.MicrodataParser.getMicrodataAsJSON(tagSoupParser.getDOM(), System.out); + org.apache.any23.extractor.microdata.MicrodataParser.getMicrodataAsJSON(tagSoupParser.getDOM(), out); } finally { if (documentInputInputStream != null) StreamUtils.closeGracefully(documentInputInputStream); } diff --git a/cli/src/main/java/org/apache/any23/cli/MimeDetector.java b/cli/src/main/java/org/apache/any23/cli/MimeDetector.java index c9072cba3..d4ec916c3 100644 --- a/cli/src/main/java/org/apache/any23/cli/MimeDetector.java +++ b/cli/src/main/java/org/apache/any23/cli/MimeDetector.java @@ -32,6 +32,7 @@ import org.apache.any23.source.StringDocumentSource; import java.io.File; +import java.io.PrintStream; import java.net.URISyntaxException; import java.util.LinkedList; import java.util.List; @@ -44,7 +45,7 @@ * @author Michele Mostarda (mostarda@fbk.eu) */ @Parameters(commandNames = { "mimes" }, commandDescription = "MIME Type Detector Tool.") -public class MimeDetector implements Tool{ +public class MimeDetector extends BaseTool { public static final String FILE_DOCUMENT_PREFIX = "file://"; @@ -59,6 +60,18 @@ public class MimeDetector implements Tool{ ) private List document = new LinkedList(); + private PrintStream out = System.out; + + @Override + PrintStream getOut() { + return out; + } + + @Override + void setOut(PrintStream out) { + this.out = out; + } + public void run() throws Exception { if (document.isEmpty()) { throw new IllegalArgumentException("No input document URL specified"); @@ -71,7 +84,7 @@ public void run() throws Exception { documentSource.openInputStream(), MIMEType.parse(documentSource.getContentType()) ); - System.out.println(mimeType); + out.println(mimeType); } public static final class MimeDetectorDocumentSourceConverter implements IStringConverter { diff --git a/cli/src/main/java/org/apache/any23/cli/PluginVerifier.java b/cli/src/main/java/org/apache/any23/cli/PluginVerifier.java index a43065d16..ae0868bcd 100644 --- a/cli/src/main/java/org/apache/any23/cli/PluginVerifier.java +++ b/cli/src/main/java/org/apache/any23/cli/PluginVerifier.java @@ -38,7 +38,7 @@ * @author Michele Mostarda (mostarda@fbk.eu) */ @Parameters(commandNames = { "verify" }, commandDescription = "Utility for plugin management verification.") -public class PluginVerifier implements Tool { +public class PluginVerifier extends BaseTool { private Any23PluginManager pluginManager = Any23PluginManager.getInstance(); @@ -48,6 +48,18 @@ public class PluginVerifier implements Tool { ) private List pluginsDirs = new LinkedList<>(); + private PrintStream out = System.out; + + @Override + PrintStream getOut() { + return out; + } + + @Override + void setOut(PrintStream out) { + this.out = out; + } + public void run() throws Exception { if (pluginsDirs.isEmpty()) { throw new IllegalArgumentException("No plugin directory specified."); @@ -63,8 +75,8 @@ public void run() throws Exception { final Iterator plugins = pluginManager.getExtractors(); while (plugins.hasNext()) { - printPluginData(plugins.next(), System.out); - System.out.println("------------------------------------------------------------------------"); + printPluginData(plugins.next(), out); + out.println("------------------------------------------------------------------------"); } } diff --git a/cli/src/main/java/org/apache/any23/cli/Rover.java b/cli/src/main/java/org/apache/any23/cli/Rover.java index 18f0c0634..5b49b393c 100644 --- a/cli/src/main/java/org/apache/any23/cli/Rover.java +++ b/cli/src/main/java/org/apache/any23/cli/Rover.java @@ -59,7 +59,7 @@ * @author Gabriele Renzi */ @Parameters(commandNames = { "rover" }, commandDescription = "Any23 Command Line Tool.") -public class Rover implements Tool { +public class Rover extends BaseTool { private static final List FORMATS = WriterFactoryRegistry.getInstance().getIdentifiers(); @@ -117,6 +117,16 @@ public class Rover implements Tool { private ExtractionParameters extractionParameters; + @Override + PrintStream getOut() { + return outputStream; + } + + @Override + void setOut(PrintStream out) { + outputStream = out; + } + protected void configure() { try { tripleHandler = WriterFactoryRegistry.getInstance().getWriterInstanceByIdentifier(format, outputStream); diff --git a/cli/src/main/java/org/apache/any23/cli/ToolRunner.java b/cli/src/main/java/org/apache/any23/cli/ToolRunner.java index b875ec7ee..a19ecf1c4 100644 --- a/cli/src/main/java/org/apache/any23/cli/ToolRunner.java +++ b/cli/src/main/java/org/apache/any23/cli/ToolRunner.java @@ -69,7 +69,11 @@ public static void main( String[] args ) throws Exception { exit( new ToolRunner().execute( args ) ); } - public int execute(String...args) throws Exception { + public int execute(String... args) throws Exception { + return execute(false, args); + } + + int execute(boolean concise, String... args) throws Exception { JCommander commander = new JCommander(this); commander.setProgramName(System.getProperty("app.name")); @@ -133,7 +137,12 @@ public int execute(String...args) throws Exception { infoStream.println(); try { - Tool.class.cast( commands.get( parsedCommand ).getObjects().get( 0 ) ).run(); + Tool tool = Tool.class.cast(commands.get(parsedCommand).getObjects().get(0)); + if (tool instanceof BaseTool) { + ((BaseTool) tool).run(concise); + } else { + tool.run(); + } } catch (Throwable t) { exit = 1; error = t; diff --git a/cli/src/main/java/org/apache/any23/cli/VocabPrinter.java b/cli/src/main/java/org/apache/any23/cli/VocabPrinter.java index 7fde88733..f3126af24 100644 --- a/cli/src/main/java/org/apache/any23/cli/VocabPrinter.java +++ b/cli/src/main/java/org/apache/any23/cli/VocabPrinter.java @@ -26,19 +26,33 @@ import com.beust.jcommander.Parameter; import com.beust.jcommander.Parameters; +import java.io.PrintStream; + /** * Prints out the vocabulary RDFSchema as NQuads. * * @author Michele Mostarda (mostarda@fbk.eu) */ @Parameters(commandNames = { "vocab" }, commandDescription = "Prints out the RDF Schema of the vocabularies used by Any23.") -public class VocabPrinter implements Tool { +public class VocabPrinter extends BaseTool { @Parameter(names = { "-f", "--format" }, description = "Vocabulary output format", converter = RDFFormatConverter.class) private RDFFormat format = RDFFormat.NQUADS; + private PrintStream out = System.out; + + @Override + PrintStream getOut() { + return out; + } + + @Override + void setOut(PrintStream out) { + this.out = out; + } + public void run() throws Exception { - RDFSchemaUtils.serializeVocabularies(format, System.out); + RDFSchemaUtils.serializeVocabularies(format, out); } public static final class RDFFormatConverter implements diff --git a/cli/src/test/java/org/apache/any23/cli/MicrodataParserTest.java b/cli/src/test/java/org/apache/any23/cli/MicrodataParserTest.java index 2f46aaae2..e0a11233d 100644 --- a/cli/src/test/java/org/apache/any23/cli/MicrodataParserTest.java +++ b/cli/src/test/java/org/apache/any23/cli/MicrodataParserTest.java @@ -32,7 +32,7 @@ public MicrodataParserTest() { @Test public void testRunOnFile() throws Exception { - runToolCheckExit0("file:"+copyResourceToTempFile("/microdata/microdata-nested.html").getAbsolutePath()); + runToolCheckExit0("file:" + copyResourceToTempFile("/microdata/microdata-nested.html").getAbsolutePath()); } @Test diff --git a/cli/src/test/java/org/apache/any23/cli/MimeDetectorTest.java b/cli/src/test/java/org/apache/any23/cli/MimeDetectorTest.java index 3894d32f3..46b65daf7 100644 --- a/cli/src/test/java/org/apache/any23/cli/MimeDetectorTest.java +++ b/cli/src/test/java/org/apache/any23/cli/MimeDetectorTest.java @@ -39,13 +39,13 @@ public void testDetectURL() throws Exception { @Test public void testDetectFile() throws Exception { assumeOnlineAllowed(); - runToolCheckExit0("file://"+copyResourceToTempFile("/application/trix/test1.trx").getAbsolutePath()); + runToolCheckExit0("file://" + copyResourceToTempFile("/application/trix/test1.trx").getAbsolutePath()); } @Test public void testDetectInline() throws Exception { assumeOnlineAllowed(); - runToolCheckExit0( new String[] {"inline:// ."} ); + runToolCheckExit0("inline:// ."); } } diff --git a/cli/src/test/java/org/apache/any23/cli/RoverTest.java b/cli/src/test/java/org/apache/any23/cli/RoverTest.java index 7bab3141f..15054e4d0 100644 --- a/cli/src/test/java/org/apache/any23/cli/RoverTest.java +++ b/cli/src/test/java/org/apache/any23/cli/RoverTest.java @@ -28,7 +28,6 @@ import org.eclipse.rdf4j.rio.RDFFormat; import java.io.File; -import java.util.Arrays; /** * Test case for {@link Rover}. @@ -130,7 +129,6 @@ private void runWithMultiSourcesAndVerify(String[] targets, int expectedExit) th final String outNQuads = FileUtils.readFileContent(outFile); final Statement[] statements = RDFUtils.parseRDF(RDFFormat.NQUADS, outNQuads); - System.out.println(Arrays.toString(statements)); Assert.assertTrue("Unexpected number of statements.", statements.length > 9); } diff --git a/cli/src/test/java/org/apache/any23/cli/SimpleRoverTest.java b/cli/src/test/java/org/apache/any23/cli/SimpleRoverTest.java index ec75022a7..587dda768 100644 --- a/cli/src/test/java/org/apache/any23/cli/SimpleRoverTest.java +++ b/cli/src/test/java/org/apache/any23/cli/SimpleRoverTest.java @@ -16,12 +16,13 @@ */ package org.apache.any23.cli; -import com.google.common.io.Files; import java.io.File; +import java.lang.invoke.MethodHandles; +import java.nio.charset.StandardCharsets; import java.util.Arrays; import java.util.Collection; -import org.apache.any23.util.FileUtils; -import org.apache.pdfbox.util.Charsets; + +import org.apache.commons.io.FileUtils; import org.junit.Assert; import org.junit.Test; import org.junit.runner.RunWith; @@ -38,7 +39,7 @@ public class SimpleRoverTest extends ToolTestBase { private static final String baseUri = "urn:test"; - private static final Logger log = LoggerFactory.getLogger(SimpleRoverTest.class); + private static final Logger log = LoggerFactory.getLogger(MethodHandles.lookup().lookupClass()); private String filePath; @@ -57,8 +58,7 @@ public SimpleRoverTest(String filePath) { } /** - * Ref {@link https://issues.apache.org/jira/browse/ANY23-310} unit test. - * @throws Exception + * Ref ANY23-310 unit test. */ @Test public void ref310Test() @@ -75,20 +75,23 @@ public void ref310Test() Assert.assertTrue(logfile.exists()); Assert.assertTrue(outputFile.exists()); // check if output file is longer than 10 chracters - String outputFileContent = FileUtils.readFileContent(outputFile); + String outputFileContent = FileUtils.readFileToString(outputFile, StandardCharsets.UTF_8); Assert.assertTrue(outputFileContent.length() > 10); - String[] logFileContent = FileUtils.readFileLines(logfile); - Assert.assertTrue(logFileContent.length == 2); + String[] logFileContent = FileUtils.readLines(logfile, StandardCharsets.UTF_8).toArray(new String[0]); + Assert.assertEquals(2, logFileContent.length); //Assert.assertTrue(logFileContent[1].split("\\W*")[1] == ); int contentSize = Integer.valueOf(logFileContent[1].split("\\t")[1]); - log.info("Content: '{}'", contentSize); String extractors = logFileContent[1].split("\\t")[4].replaceAll("[\\[\\]\\s:\\d]", ""); - log.info("Extractors: '{}'", extractors); - - - log.debug("Log file location: {}", logfile.getAbsolutePath()); - log.trace("Log file content: \n{}\n", Files.toString(logfile, Charsets.UTF_8)); + + if (log.isDebugEnabled()) { + log.debug("Content: '{}'", contentSize); + log.debug("Extractors: '{}'", extractors); + log.debug("Log file location: {}", logfile.getAbsolutePath()); + } + if (log.isTraceEnabled()) { + log.trace("Log file content: \n{}\n", FileUtils.readFileToString(logfile, StandardCharsets.UTF_8)); + } Assert.assertTrue("Content size should be greated than 0", contentSize > 0); Assert.assertFalse(extractors.isEmpty()); @@ -96,12 +99,11 @@ public void ref310Test() } /** - * Ref {@link https://issues.apache.org/jira/browse/ANY23-310} unit test. + * Ref ANY23-310 unit test. * * Example without the logging file. * * By default that test is not active. It might be useful for debugging. - * @throws Exception */ @Test public void ref310ExtendedTest() @@ -115,7 +117,7 @@ public void ref310ExtendedTest() Assert.assertTrue(outputFile.exists()); // check if output file is longer than 10 chracters - String outputFileContent = FileUtils.readFileContent(outputFile); + String outputFileContent = FileUtils.readFileToString(outputFile, StandardCharsets.UTF_8); Assert.assertTrue(outputFileContent.length() > 10); diff --git a/cli/src/test/java/org/apache/any23/cli/ToolTestBase.java b/cli/src/test/java/org/apache/any23/cli/ToolTestBase.java index fef49cdab..a81531565 100644 --- a/cli/src/test/java/org/apache/any23/cli/ToolTestBase.java +++ b/cli/src/test/java/org/apache/any23/cli/ToolTestBase.java @@ -21,6 +21,7 @@ import org.apache.any23.Any23OnlineTestBase; import java.util.Arrays; +import java.util.Objects; import static java.lang.String.format; import static org.junit.Assert.assertEquals; @@ -30,7 +31,6 @@ * * @author Michele Mostarda (mostarda@fbk.eu) */ -// TODO: improve support for Tool testing, intercept i/o streams. public abstract class ToolTestBase extends Any23OnlineTestBase { public static final String TOOL_RUN_METHOD = "run"; @@ -38,8 +38,7 @@ public abstract class ToolTestBase extends Any23OnlineTestBase { private final Class toolClazz; protected ToolTestBase(Class tool) { - if (tool == null) throw new NullPointerException(); - toolClazz = tool; + toolClazz = Objects.requireNonNull(tool, "Tool class cannot be null."); } /** @@ -56,7 +55,7 @@ protected int runTool(String... args) throws Exception { enhancedArgs[0] = commandName; System.arraycopy( args, 0, enhancedArgs, 1, args.length ); - return new ToolRunner().execute( enhancedArgs ); + return new ToolRunner().execute(true, enhancedArgs); } /** diff --git a/cli/src/test/java/org/apache/any23/cli/YAMLRoverTest.java b/cli/src/test/java/org/apache/any23/cli/YAMLRoverTest.java index f559f6794..6024110dc 100644 --- a/cli/src/test/java/org/apache/any23/cli/YAMLRoverTest.java +++ b/cli/src/test/java/org/apache/any23/cli/YAMLRoverTest.java @@ -16,10 +16,11 @@ */ package org.apache.any23.cli; -import com.google.common.io.Files; import java.io.File; import java.io.IOException; -import org.apache.pdfbox.util.Charsets; +import java.nio.charset.StandardCharsets; + +import org.apache.commons.io.FileUtils; import org.junit.Assert; import org.junit.Test; import org.slf4j.Logger; @@ -56,7 +57,7 @@ public void simpleTest() Assert.assertTrue(logfile.exists()); log.debug("Log file location: {}", logfile.getAbsolutePath()); - log.info("Log file content: \n{}\n", Files.toString(logfile, Charsets.UTF_8)); + log.debug("Log file content: \n{}\n", FileUtils.readFileToString(logfile, StandardCharsets.UTF_8)); Assert.assertEquals("Unexpected exit code.", 0, exitCode); assertFileContainsString(outputFile, baseUri); @@ -72,7 +73,7 @@ public void simpleTest() * @return */ public void assertFileContainsString(File f, String s) throws IOException { - String fileContent = Files.toString(f, Charsets.UTF_8); + String fileContent = FileUtils.readFileToString(f, StandardCharsets.UTF_8); log.trace("File content: \n{}\n", fileContent); Assert.assertTrue(fileContent.contains(s)); } diff --git a/core/src/main/java/org/apache/any23/extractor/SingleDocumentExtraction.java b/core/src/main/java/org/apache/any23/extractor/SingleDocumentExtraction.java index 028b518b1..77ed28cfb 100644 --- a/core/src/main/java/org/apache/any23/extractor/SingleDocumentExtraction.java +++ b/core/src/main/java/org/apache/any23/extractor/SingleDocumentExtraction.java @@ -218,8 +218,8 @@ public SingleDocumentExtractionReport run(ExtractionParameters extractionParamet } catch (Exception ex) { throw new IllegalArgumentException("Invalid IRI: " + in.getDocumentIRI(), ex); } - if(log.isInfoEnabled()) { - log.info("Processing " + this.documentIRI); + if (log.isDebugEnabled()) { + log.debug("Processing " + this.documentIRI); } filterExtractorsByMIMEType(); diff --git a/core/src/test/java/org/apache/any23/extractor/html/AbstractExtractorTestCase.java b/core/src/test/java/org/apache/any23/extractor/html/AbstractExtractorTestCase.java index f04d59ffc..d9e0fe516 100644 --- a/core/src/test/java/org/apache/any23/extractor/html/AbstractExtractorTestCase.java +++ b/core/src/test/java/org/apache/any23/extractor/html/AbstractExtractorTestCase.java @@ -48,9 +48,12 @@ import org.eclipse.rdf4j.rio.Rio; import org.eclipse.rdf4j.sail.Sail; import org.eclipse.rdf4j.sail.memory.MemoryStore; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; import java.io.IOException; import java.io.StringWriter; +import java.lang.invoke.MethodHandles; import java.util.ArrayList; import java.util.Collection; import java.util.Collections; @@ -63,6 +66,8 @@ */ public abstract class AbstractExtractorTestCase extends AbstractAny23TestBase { + private static final Logger log = LoggerFactory.getLogger(MethodHandles.lookup().lookupClass()); + /** * Base test document. */ @@ -363,11 +368,11 @@ protected void assertNoIssues() { for (Map.Entry> entry : report .getExtractorToIssues().entrySet()) { if (entry.getValue().size() > 0) { - System.out.println("Unexpected issue for extractor " + entry.getKey() + log.debug("Unexpected issue for extractor " + entry.getKey() + " : " + entry.getValue()); } - for(Issue nextIssue : entry.getValue()) { - if(nextIssue.getLevel() == IssueLevel.ERROR || nextIssue.getLevel() == IssueLevel.FATAL) { + for (Issue nextIssue : entry.getValue()) { + if (nextIssue.getLevel() == IssueLevel.ERROR || nextIssue.getLevel() == IssueLevel.FATAL) { Assert.fail("Unexpected issue for extractor " + entry.getKey() + " : " + entry.getValue()); } diff --git a/core/src/test/java/org/apache/any23/extractor/rdfa/RDFa11ExtractorTest.java b/core/src/test/java/org/apache/any23/extractor/rdfa/RDFa11ExtractorTest.java index c3d7fd528..1bc63d479 100644 --- a/core/src/test/java/org/apache/any23/extractor/rdfa/RDFa11ExtractorTest.java +++ b/core/src/test/java/org/apache/any23/extractor/rdfa/RDFa11ExtractorTest.java @@ -292,7 +292,7 @@ public void testRDFa11Extraction() @Test public void testOpenGraphStructuredProperties() throws IOException, ExtractionException, RepositoryException { assertExtract("/html/rdfa/opengraph-structured-properties.html"); - logger.info( dumpHumanReadableTriples() ); + logger.debug(dumpHumanReadableTriples()); Assert.assertEquals(31, getStatementsSize(null, null, null) ); final OGP vOGP = OGP.getInstance(); @@ -328,7 +328,7 @@ protected ExtractorFactory getExtractorFactory() { @Test public void testOpenGraphAlternateObjectTypes() throws IOException, ExtractionException, RepositoryException { assertExtract("/html/rdfa/opengraph-music-song-object-type.html"); - logger.info( dumpHumanReadableTriples() ); + logger.debug(dumpHumanReadableTriples()); Assert.assertEquals(9, getStatementsSize(null, null, null) ); final OGPMusic vOGPMusic = OGPMusic.getInstance(); diff --git a/core/src/test/java/org/apache/any23/vocab/RDFSchemaUtilsTest.java b/core/src/test/java/org/apache/any23/vocab/RDFSchemaUtilsTest.java index c56641ab0..d8fd8d533 100644 --- a/core/src/test/java/org/apache/any23/vocab/RDFSchemaUtilsTest.java +++ b/core/src/test/java/org/apache/any23/vocab/RDFSchemaUtilsTest.java @@ -39,7 +39,7 @@ public class RDFSchemaUtilsTest { /** * Test case for * {@link RDFSchemaUtils#serializeVocabularies( - * org.apache.any23.vocab.RDFSchemaUtils.VocabularyFormat, java.io.PrintStream)} with NTriples format. + * org.eclipse.rdf4j.rio.RDFFormat, java.io.PrintStream)} with NTriples format. */ @Test public void testSerializeVocabulariesNTriples() { @@ -49,7 +49,7 @@ public void testSerializeVocabulariesNTriples() { /** * Test case for * {@link RDFSchemaUtils#serializeVocabularies( - * org.apache.any23.vocab.RDFSchemaUtils.VocabularyFormat, java.io.PrintStream)} with RDFXML format. + * org.eclipse.rdf4j.rio.RDFFormat, java.io.PrintStream)} with RDFXML format. */ @Test public void testSerializeVocabulariesRDFXML() { @@ -58,12 +58,12 @@ public void testSerializeVocabulariesRDFXML() { private void serializeVocabularies(RDFFormat format, int expectedLines) { final ByteArrayOutputStream baos = new ByteArrayOutputStream(); - final PrintStream ps = new PrintStream(baos); - RDFSchemaUtils.serializeVocabularies(format, ps); - ps.close(); + try (PrintStream ps = new PrintStream(baos)) { + RDFSchemaUtils.serializeVocabularies(format, ps); + } final String output = baos.toString(); logger.debug(output); - final int occurrences= StringUtils.countOccurrences(output, "\n"); + final int occurrences = StringUtils.countOccurrences(output, "\n"); Assert.assertEquals(expectedLines, occurrences); } diff --git a/plugins/office-scraper/src/test/java/org/apache/any23/plugin/officescraper/ExcelExtractorTest.java b/plugins/office-scraper/src/test/java/org/apache/any23/plugin/officescraper/ExcelExtractorTest.java index 86a16fe1e..3edebbfe0 100644 --- a/plugins/office-scraper/src/test/java/org/apache/any23/plugin/officescraper/ExcelExtractorTest.java +++ b/plugins/office-scraper/src/test/java/org/apache/any23/plugin/officescraper/ExcelExtractorTest.java @@ -94,7 +94,7 @@ private void processFile(String resource) throws IOException, ExtractionExceptio ); extractor.run(extractionParameters, extractionContext, is, extractionResult); compositeTripleHandler.close(); - logger.info(out.toString()); + logger.debug(out.toString()); verifyPredicateOccurrence(verifierTripleHandler, Excel.getInstance().containsSheet, 2 ); verifyPredicateOccurrence(verifierTripleHandler, Excel.getInstance().containsRow , 6 ); From 0291f588d04859053ef4eb8845686bad824b4461 Mon Sep 17 00:00:00 2001 From: Hans Date: Thu, 2 Aug 2018 13:01:19 -0500 Subject: [PATCH 3/3] added license and javadoc --- .../java/org/apache/any23/cli/BaseTool.java | 24 +++++++++++++++++++ 1 file changed, 24 insertions(+) diff --git a/cli/src/main/java/org/apache/any23/cli/BaseTool.java b/cli/src/main/java/org/apache/any23/cli/BaseTool.java index d9a82f584..6164158e6 100644 --- a/cli/src/main/java/org/apache/any23/cli/BaseTool.java +++ b/cli/src/main/java/org/apache/any23/cli/BaseTool.java @@ -1,9 +1,33 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + package org.apache.any23.cli; import java.io.IOException; import java.io.OutputStream; import java.io.PrintStream; +/** + * This class reduces the verbosity of testing command-line + * console output by intercepting the underlying {@link PrintStream} + * when applicable and replacing it with a more concise version. + * + * @author Hans Brende (hansbrende@apache.org) + */ abstract class BaseTool implements Tool { abstract PrintStream getOut();