From 4c81edde390b6b6e91566f490ca5d915ca0b0945 Mon Sep 17 00:00:00 2001 From: Lewis John McGibbney Date: Wed, 27 Dec 2017 20:06:08 +0000 Subject: [PATCH 1/2] ANY23-318 ExtractionException handling in BaseRDFExtractor.java kills entire extraction --- .../any23/extractor/ExtractionParameters.java | 22 +-- .../main/java/org/apache/any23/cli/Rover.java | 4 +- .../extractor/SingleDocumentExtraction.java | 20 +-- .../any23/extractor/rdf/BaseRDFExtractor.java | 19 +-- .../DefaultValidationReportBuilder.java | 12 +- .../any23/validator/ValidationReport.java | 28 +++- .../any23/validator/rule/AboutNotURIRule.java | 8 +- .../validator/rule/MetaNameMisuseFix.java | 2 + .../validator/rule/MetaNameMisuseRule.java | 6 +- .../MissingItemscopeAttributeValueFix.java | 27 ++-- .../MissingItemscopeAttributeValueRule.java | 6 +- .../rule/MissingOpenGraphNamespaceRule.java | 4 +- .../validator/rule/OpenGraphNamespaceFix.java | 5 +- .../test/java/org/apache/any23/Any23Test.java | 8 +- .../any23/validator/DefaultValidatorTest.java | 17 +-- .../XMLValidationReportSerializerTest.java | 2 +- .../apache/any23/servlet/RedirectServlet.java | 51 +++++-- .../org/apache/any23/servlet/Servlet.java | 16 +-- .../apache/any23/servlet/WebResponder.java | 2 +- .../resources/microdata/microdata-basic.html | 129 ++++++++++-------- 20 files changed, 244 insertions(+), 144 deletions(-) diff --git a/api/src/main/java/org/apache/any23/extractor/ExtractionParameters.java b/api/src/main/java/org/apache/any23/extractor/ExtractionParameters.java index 2bd7e2bd1..96a6218f9 100644 --- a/api/src/main/java/org/apache/any23/extractor/ExtractionParameters.java +++ b/api/src/main/java/org/apache/any23/extractor/ExtractionParameters.java @@ -73,15 +73,15 @@ public ExtractionParameters( this.extractionFlags = extractionFlags == null ? - new HashMap() + new HashMap<>() : - new HashMap(extractionFlags); + new HashMap<>(extractionFlags); this.extractionProperties = extractionProperties == null ? - new HashMap() + new HashMap<>() : - new HashMap(extractionProperties); + new HashMap<>(extractionProperties); } /** @@ -122,7 +122,7 @@ public ExtractionParameters(Configuration configuration, ValidationMode extracti * @return the default extraction parameters. */ public static final ExtractionParameters newDefault(Configuration c) { - return new ExtractionParameters(c, ValidationMode.None); + return new ExtractionParameters(c, ValidationMode.NONE); } /** @@ -131,30 +131,30 @@ public static final ExtractionParameters newDefault(Configuration c) { * @return the default extraction parameters. */ public static final ExtractionParameters newDefault() { - return new ExtractionParameters(DefaultConfiguration.singleton(), ValidationMode.None); + return new ExtractionParameters(DefaultConfiguration.singleton(), ValidationMode.NONE); } /** * Declares the supported validation actions. */ public enum ValidationMode { - None, - Validate, - ValidateAndFix + NONE, + VALIDATE, + VALIDATE_AND_FIX } /** * @return true if validation is active. */ public boolean isValidate() { - return extractionMode == ValidationMode.Validate || extractionMode == ValidationMode.ValidateAndFix; + return extractionMode == ValidationMode.VALIDATE || extractionMode == ValidationMode.VALIDATE_AND_FIX; } /** * @return true if fix is active. */ public boolean isFix() { - return extractionMode == ValidationMode.ValidateAndFix; + return extractionMode == ValidationMode.VALIDATE_AND_FIX; } /** diff --git a/cli/src/main/java/org/apache/any23/cli/Rover.java b/cli/src/main/java/org/apache/any23/cli/Rover.java index ffa1de07a..18f0c0634 100644 --- a/cli/src/main/java/org/apache/any23/cli/Rover.java +++ b/cli/src/main/java/org/apache/any23/cli/Rover.java @@ -154,9 +154,9 @@ protected void configure() { extractionParameters = pedantic ? - new ExtractionParameters(configuration, ValidationMode.ValidateAndFix, nestingDisabled) + new ExtractionParameters(configuration, ValidationMode.VALIDATE_AND_FIX, nestingDisabled) : - new ExtractionParameters(configuration, ValidationMode.None , nestingDisabled); + new ExtractionParameters(configuration, ValidationMode.NONE , nestingDisabled); if (defaultns != null) { extractionParameters.setProperty(ExtractionParameters.EXTRACTION_CONTEXT_IRI_PROPERTY, defaultns); diff --git a/core/src/main/java/org/apache/any23/extractor/SingleDocumentExtraction.java b/core/src/main/java/org/apache/any23/extractor/SingleDocumentExtraction.java index cd6fea7fc..9cee7a464 100644 --- a/core/src/main/java/org/apache/any23/extractor/SingleDocumentExtraction.java +++ b/core/src/main/java/org/apache/any23/extractor/SingleDocumentExtraction.java @@ -73,7 +73,7 @@ public class SingleDocumentExtraction { private static final SINDICE vSINDICE = SINDICE.getInstance(); - private final static Logger log = LoggerFactory.getLogger(SingleDocumentExtraction.class); + private static final Logger log = LoggerFactory.getLogger(SingleDocumentExtraction.class); private final Configuration configuration; @@ -115,13 +115,15 @@ public class SingleDocumentExtraction { public SingleDocumentExtraction( Configuration configuration, DocumentSource in, ExtractorGroup extractors, TripleHandler output ) { - if(configuration == null) throw new NullPointerException("configuration cannot be null."); - if(in == null) throw new NullPointerException("in cannot be null."); + if(configuration == null) + throw new NullPointerException("configuration cannot be null."); + if(in == null) + throw new NullPointerException("in cannot be null."); this.configuration = configuration; this.in = in; this.extractors = extractors; - List tripleHandlers = new ArrayList(); + List tripleHandlers = new ArrayList<>(); tripleHandlers.add(output); tripleHandlers.add(new CountingTripleHandler()); this.output = new CompositeTripleHandler(tripleHandlers); @@ -222,7 +224,7 @@ public SingleDocumentExtractionReport run(ExtractionParameters extractionParamet filterExtractorsByMIMEType(); if(log.isDebugEnabled()) { - StringBuffer sb = new StringBuffer("Extractors "); + StringBuilder sb = new StringBuilder("Extractors "); for (ExtractorFactory factory : matchingExtractors) { sb.append(factory.getExtractorName()); sb.append(' '); @@ -347,7 +349,7 @@ public boolean hasMatchingExtractors() throws IOException { */ @SuppressWarnings("rawtypes") public List getMatchingExtractors() { - final List extractorsList = new ArrayList(); + final List extractorsList = new ArrayList<>(); for(ExtractorFactory extractorFactory : matchingExtractors) { extractorsList.add( extractorFactory.createExtractor() ); } @@ -415,7 +417,8 @@ private String extractDocumentLanguage(ExtractionParameters extractionParameters */ private void filterExtractorsByMIMEType() throws IOException { - if (matchingExtractors != null) return; // has already been run. + if (matchingExtractors != null) + return; // has already been run. if (detector == null || extractors.allExtractorsSupportAllContentTypes()) { matchingExtractors = extractors; @@ -515,7 +518,8 @@ private SingleExtractionReport runExtractor( * @throws IOException */ private void ensureHasLocalCopy() throws IOException { - if (localDocumentSource != null) return; + if (localDocumentSource != null) + return; if (in.isLocal()) { localDocumentSource = in; return; diff --git a/core/src/main/java/org/apache/any23/extractor/rdf/BaseRDFExtractor.java b/core/src/main/java/org/apache/any23/extractor/rdf/BaseRDFExtractor.java index 549cc1afa..6b9377e0f 100644 --- a/core/src/main/java/org/apache/any23/extractor/rdf/BaseRDFExtractor.java +++ b/core/src/main/java/org/apache/any23/extractor/rdf/BaseRDFExtractor.java @@ -22,12 +22,14 @@ import org.apache.any23.extractor.ExtractionParameters; import org.apache.any23.extractor.ExtractionResult; import org.apache.any23.extractor.Extractor; -import org.apache.any23.extractor.ExtractorDescription; import org.eclipse.rdf4j.rio.RDFHandlerException; import org.eclipse.rdf4j.rio.RDFParseException; import org.eclipse.rdf4j.rio.RDFParser; import org.eclipse.rdf4j.rio.RioSetting; import org.eclipse.rdf4j.rio.helpers.BasicParserSettings; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + import java.io.IOException; import java.io.InputStream; import java.util.HashSet; @@ -40,9 +42,14 @@ */ public abstract class BaseRDFExtractor implements Extractor.ContentExtractor { + private static final Logger LOG = LoggerFactory.getLogger(BaseRDFExtractor.class); private boolean verifyDataType; private boolean stopAtFirstError; + public BaseRDFExtractor() { + this(false, false); + } + /** * Constructor, allows to specify the validation and error handling policies. * @@ -56,17 +63,11 @@ public BaseRDFExtractor(boolean verifyDataType, boolean stopAtFirstError) { this.stopAtFirstError = stopAtFirstError; } - public abstract ExtractorDescription getDescription(); - protected abstract RDFParser getParser( ExtractionContext extractionContext, ExtractionResult extractionResult ); - public BaseRDFExtractor() { - this(false, false); - } - public boolean isVerifyDataType() { return verifyDataType; } @@ -79,10 +80,12 @@ public boolean isStopAtFirstError() { return stopAtFirstError; } + @Override public void setStopAtFirstError(boolean b) { stopAtFirstError = b; } + @Override public void run( ExtractionParameters extractionParameters, ExtractionContext extractionContext, @@ -106,7 +109,7 @@ public void run( } catch (RDFHandlerException ex) { throw new IllegalStateException("Unexpected exception.", ex); } catch (RDFParseException ex) { - throw new ExtractionException("Error while parsing RDF document.", ex, extractionResult); + LOG.error("Error while parsing RDF document.", ex, extractionResult); } } diff --git a/core/src/main/java/org/apache/any23/validator/DefaultValidationReportBuilder.java b/core/src/main/java/org/apache/any23/validator/DefaultValidationReportBuilder.java index 8f73d8d29..bbab1b463 100644 --- a/core/src/main/java/org/apache/any23/validator/DefaultValidationReportBuilder.java +++ b/core/src/main/java/org/apache/any23/validator/DefaultValidationReportBuilder.java @@ -35,7 +35,9 @@ public class DefaultValidationReportBuilder implements ValidationReportBuilder { private List ruleActivations; private List errors; - public DefaultValidationReportBuilder() {} + public DefaultValidationReportBuilder() { + //default constructor + } public ValidationReport getReport() { return new DefaultValidationReport( @@ -47,7 +49,7 @@ public ValidationReport getReport() { public void reportIssue(ValidationReport.IssueLevel issueLevel, String message, Node n) { if(issues == null) { - issues = new ArrayList(); + issues = new ArrayList<>(); } issues.add( new ValidationReport.Issue(issueLevel, message, n) ); } @@ -58,21 +60,21 @@ public void reportIssue(ValidationReport.IssueLevel issueLevel, String message) public void traceRuleActivation(Rule r) { if(ruleActivations == null) { - ruleActivations = new ArrayList(); + ruleActivations = new ArrayList<>(); } ruleActivations.add( new ValidationReport.RuleActivation(r) ); } public void reportRuleError(Rule r, Exception e, String msg) { if(errors == null) { - errors = new ArrayList(); + errors = new ArrayList<>(); } errors.add( new ValidationReport.RuleError(r, e, msg) ); } public void reportFixError(Fix f, Exception e, String msg) { if(errors == null) { - errors = new ArrayList(); + errors = new ArrayList<>(); } errors.add( new ValidationReport.FixError(f, e, msg) ); diff --git a/core/src/main/java/org/apache/any23/validator/ValidationReport.java b/core/src/main/java/org/apache/any23/validator/ValidationReport.java index 7b478095d..56b3f1090 100644 --- a/core/src/main/java/org/apache/any23/validator/ValidationReport.java +++ b/core/src/main/java/org/apache/any23/validator/ValidationReport.java @@ -39,9 +39,9 @@ public interface ValidationReport extends Serializable { * Defines the different issue levels. */ enum IssueLevel { - error, - warning, - info + ERROR, + WARNING, + INFO } /** @@ -70,9 +70,13 @@ enum IssueLevel { */ class Issue implements Serializable { + /** + * + */ + private static final long serialVersionUID = 1L; private final IssueLevel level; private final String message; - private final Node origin; + private final transient Node origin; public Issue(IssueLevel level, String message, Node origin) { if(level == null) { @@ -117,6 +121,10 @@ public String toString() { */ class RuleActivation implements Serializable { + /** + * + */ + private static final long serialVersionUID = 1L; private final String ruleStr; public RuleActivation(Rule r) { @@ -141,6 +149,10 @@ public String toString() { */ abstract class Error implements Serializable { + /** + * + */ + private static final long serialVersionUID = 1L; private final Exception cause; private final String message; @@ -174,6 +186,10 @@ public String toString() { */ class RuleError extends Error { + /** + * + */ + private static final long serialVersionUID = 1L; private final Rule origin; public RuleError(Rule r, Exception e, String msg) { @@ -199,6 +215,10 @@ public String toString() { */ class FixError extends Error { + /** + * + */ + private static final long serialVersionUID = 1L; private final Fix origin; public FixError(Fix f, Exception e, String msg) { diff --git a/core/src/main/java/org/apache/any23/validator/rule/AboutNotURIRule.java b/core/src/main/java/org/apache/any23/validator/rule/AboutNotURIRule.java index 2e709ed76..0275c4e97 100644 --- a/core/src/main/java/org/apache/any23/validator/rule/AboutNotURIRule.java +++ b/core/src/main/java/org/apache/any23/validator/rule/AboutNotURIRule.java @@ -40,21 +40,23 @@ public class AboutNotURIRule implements Rule { public static final String NODES_WITH_INVALID_ABOUT = "nodes-with-invalid-about"; + @Override public String getHRName() { return "about-not-uri-rule"; } + @Override public boolean applyOn( DOMDocument document, - RuleContext context, + @SuppressWarnings("rawtypes") RuleContext context, ValidationReportBuilder validationReportBuilder ) { final List nodesWithAbout = document.getNodesWithAttribute("about"); - final List nodesWithInvalidAbout = new ArrayList(); + final List nodesWithInvalidAbout = new ArrayList<>(); for(Node nodeWithAbout : nodesWithAbout) { if ( ! aboutIsValid(nodeWithAbout) ) { validationReportBuilder.reportIssue( - ValidationReport.IssueLevel.error, + ValidationReport.IssueLevel.ERROR, "Invalid about value for node, expected valid URL.", nodeWithAbout ); diff --git a/core/src/main/java/org/apache/any23/validator/rule/MetaNameMisuseFix.java b/core/src/main/java/org/apache/any23/validator/rule/MetaNameMisuseFix.java index 5a0bfaee7..149ce5f3c 100644 --- a/core/src/main/java/org/apache/any23/validator/rule/MetaNameMisuseFix.java +++ b/core/src/main/java/org/apache/any23/validator/rule/MetaNameMisuseFix.java @@ -34,10 +34,12 @@ */ public class MetaNameMisuseFix implements Fix { + @Override public String getHRName() { return "meta-name-misuse-fix"; } + @Override @SuppressWarnings("unchecked") public void execute(Rule rule, @SuppressWarnings("rawtypes") RuleContext context, DOMDocument document) { List nodes = (List) context.getData(MetaNameMisuseRule.ERRORED_META_NODES); diff --git a/core/src/main/java/org/apache/any23/validator/rule/MetaNameMisuseRule.java b/core/src/main/java/org/apache/any23/validator/rule/MetaNameMisuseRule.java index a803107be..1b965ec71 100644 --- a/core/src/main/java/org/apache/any23/validator/rule/MetaNameMisuseRule.java +++ b/core/src/main/java/org/apache/any23/validator/rule/MetaNameMisuseRule.java @@ -38,10 +38,12 @@ public class MetaNameMisuseRule implements Rule { public static final String ERRORED_META_NODES = "errored-meta-nodes"; + @Override public String getHRName() { return "meta-name-misuse-rule"; } + @Override public boolean applyOn( DOMDocument document, @SuppressWarnings("rawtypes") RuleContext context, @@ -49,14 +51,14 @@ public boolean applyOn( ) { List metaNodes = document.getNodes("/HTML/HEAD/META"); boolean foundIssue = false; - final List wrongMetaNodes = new ArrayList(); + final List wrongMetaNodes = new ArrayList<>(); for(Node metaNode : metaNodes) { Node nameNode = metaNode.getAttributes().getNamedItem("name"); if(nameNode != null && nameNode.getTextContent().contains(":")) { foundIssue = true; wrongMetaNodes.add(metaNode); validationReportBuilder.reportIssue( - ValidationReport.IssueLevel.error, + ValidationReport.IssueLevel.ERROR, "Error detected in meta node: name property contains a prefixed value.", metaNode ); diff --git a/core/src/main/java/org/apache/any23/validator/rule/MissingItemscopeAttributeValueFix.java b/core/src/main/java/org/apache/any23/validator/rule/MissingItemscopeAttributeValueFix.java index 909a33a78..58e4f11aa 100644 --- a/core/src/main/java/org/apache/any23/validator/rule/MissingItemscopeAttributeValueFix.java +++ b/core/src/main/java/org/apache/any23/validator/rule/MissingItemscopeAttributeValueFix.java @@ -18,6 +18,7 @@ import java.util.List; +import org.apache.any23.extractor.html.DomUtils; import org.apache.any23.validator.DOMDocument; import org.apache.any23.validator.Fix; import org.apache.any23.validator.Rule; @@ -30,27 +31,37 @@ */ public class MissingItemscopeAttributeValueFix implements Fix { + private static final String EMPTY_ITEMSCOPE_VALUE = ""; + + private static final String ITEMSCOPE = "itemscope"; + /** * Default constructor */ public MissingItemscopeAttributeValueFix() { + //default constructor } - public static final String EMPTY_ITEMSCOPE_VALUE = "=\"itemscope\""; - + @Override public String getHRName() { return "missing-itemscope-value-fix"; } + @Override public void execute(Rule rule, @SuppressWarnings("rawtypes") RuleContext context, DOMDocument document) { - List itemNodes = document.getNodesWithAttribute("itemscope"); - for(Node itemNode : itemNodes) { - Node itemScopeNode = itemNode.getAttributes().getNamedItem("itemscope"); - if(itemScopeNode.getNodeValue().contentEquals("")) { - itemNode.getAttributes().getNamedItem("itemscope").setNodeValue(EMPTY_ITEMSCOPE_VALUE); + List itemScopeContainerElements = document.getNodesWithAttribute(ITEMSCOPE); + for(Node itemScopeContainerElement : itemScopeContainerElements) { + Node newItemScopeContainerElement = itemScopeContainerElement; + Node itemScopeNode = newItemScopeContainerElement.getAttributes().getNamedItem(ITEMSCOPE); + if (itemScopeNode.getTextContent() == null || itemScopeNode.getTextContent() == "") { + String node = DomUtils.getXPathForNode(itemScopeContainerElement); + document.addAttribute(node, ITEMSCOPE, EMPTY_ITEMSCOPE_VALUE); + //newItemScopeContainerElement.getAttributes().removeNamedItem(ITEMSCOPE); + //Attr newItemScopeNode = document.getOriginalDocument().createAttribute(ITEMSCOPE); + //newItemScopeNode.setNodeValue(EMPTY_ITEMSCOPE_VALUE); + //newItemScopeContainerElement.getAttributes().setNamedItem(newItemScopeNode); } } } - } diff --git a/core/src/main/java/org/apache/any23/validator/rule/MissingItemscopeAttributeValueRule.java b/core/src/main/java/org/apache/any23/validator/rule/MissingItemscopeAttributeValueRule.java index 67d44b244..415b2dcf9 100644 --- a/core/src/main/java/org/apache/any23/validator/rule/MissingItemscopeAttributeValueRule.java +++ b/core/src/main/java/org/apache/any23/validator/rule/MissingItemscopeAttributeValueRule.java @@ -18,6 +18,7 @@ import java.util.List; +import org.apache.any23.extractor.html.DomUtils; import org.apache.any23.validator.DOMDocument; import org.apache.any23.validator.Rule; import org.apache.any23.validator.RuleContext; @@ -45,6 +46,7 @@ public class MissingItemscopeAttributeValueRule implements Rule { * Default constructor */ public MissingItemscopeAttributeValueRule() { + //default costructor } @Override @@ -60,7 +62,7 @@ public boolean applyOn(DOMDocument document, @SuppressWarnings("rawtypes") RuleC ValidationReportBuilder validationReportBuilder) { List itemNodes = document.getNodesWithAttribute("itemscope"); boolean foundPrecondition = false; - String propertyNode = null; + String propertyNode; Node iNode = null; for(Node itemNode : itemNodes) { iNode = itemNode; @@ -72,7 +74,7 @@ public boolean applyOn(DOMDocument document, @SuppressWarnings("rawtypes") RuleC } if(foundPrecondition) { validationReportBuilder.reportIssue( - ValidationReport.IssueLevel.error, + ValidationReport.IssueLevel.ERROR, "Located absence of an accompanying value for the the 'itemscope' attribute of element with hashcode: " + iNode.hashCode(), iNode ); diff --git a/core/src/main/java/org/apache/any23/validator/rule/MissingOpenGraphNamespaceRule.java b/core/src/main/java/org/apache/any23/validator/rule/MissingOpenGraphNamespaceRule.java index 8229525e6..3ab99f580 100644 --- a/core/src/main/java/org/apache/any23/validator/rule/MissingOpenGraphNamespaceRule.java +++ b/core/src/main/java/org/apache/any23/validator/rule/MissingOpenGraphNamespaceRule.java @@ -35,10 +35,12 @@ */ public class MissingOpenGraphNamespaceRule implements Rule { + @Override public String getHRName() { return "missing-opengraph-namespace-rule"; } + @Override public boolean applyOn( DOMDocument document, @SuppressWarnings("rawtypes") RuleContext context, @@ -57,7 +59,7 @@ public boolean applyOn( Node htmlNode = document.getNode("/HTML"); if( htmlNode.getAttributes().getNamedItem("xmlns:og") == null) { validationReportBuilder.reportIssue( - ValidationReport.IssueLevel.error, + ValidationReport.IssueLevel.ERROR, "Missing OpenGraph namespace declaration.", htmlNode ); diff --git a/core/src/main/java/org/apache/any23/validator/rule/OpenGraphNamespaceFix.java b/core/src/main/java/org/apache/any23/validator/rule/OpenGraphNamespaceFix.java index 697599156..649259086 100644 --- a/core/src/main/java/org/apache/any23/validator/rule/OpenGraphNamespaceFix.java +++ b/core/src/main/java/org/apache/any23/validator/rule/OpenGraphNamespaceFix.java @@ -21,6 +21,7 @@ import org.apache.any23.validator.Fix; import org.apache.any23.validator.Rule; import org.apache.any23.validator.RuleContext; +import org.apache.any23.vocab.OGP; /** * This fixes the missing Open Graph protocol. @@ -31,12 +32,14 @@ */ public class OpenGraphNamespaceFix implements Fix { - public static final String OPENGRAPH_PROTOCOL_NS = "http://opengraphprotocol.org/schema/"; + public static final String OPENGRAPH_PROTOCOL_NS = OGP.NS; + @Override public String getHRName() { return "opengraph-namespace-fix"; } + @Override public void execute(Rule rule, @SuppressWarnings("rawtypes") RuleContext context, DOMDocument document) { document.addAttribute("/HTML", "xmlns:og", OPENGRAPH_PROTOCOL_NS); } diff --git a/core/src/test/java/org/apache/any23/Any23Test.java b/core/src/test/java/org/apache/any23/Any23Test.java index 3f11087df..32c49515e 100644 --- a/core/src/test/java/org/apache/any23/Any23Test.java +++ b/core/src/test/java/org/apache/any23/Any23Test.java @@ -343,7 +343,7 @@ public void testExtractionParameters() throws IOException, try { runner.extract( new ExtractionParameters(DefaultConfiguration.singleton(), - ValidationMode.None), source, compositeTH1); + ValidationMode.NONE), source, compositeTH1); } finally { compositeTH1.close(); } @@ -383,7 +383,7 @@ public void testExtractionParametersWithNestingDisabled() compositeTH1.addChild(ctw1); runner.extract( new ExtractionParameters(DefaultConfiguration.singleton(), - ValidationMode.None, true), source, compositeTH1); + ValidationMode.NONE, true), source, compositeTH1); compositeTH1.close(); logger.debug("Out1: " + baos.toString()); Assert.assertEquals("Unexpected number of triples.", @@ -397,7 +397,7 @@ public void testExtractionParametersWithNestingDisabled() compositeTH2.addChild(ctw2); runner.extract( new ExtractionParameters(DefaultConfiguration.singleton(), - ValidationMode.ValidateAndFix, false), source, + ValidationMode.VALIDATE_AND_FIX, false), source, compositeTH2); compositeTH2.close(); logger.debug("Out2: " + baos.toString()); @@ -558,7 +558,7 @@ private ExtractionReport detectAndExtract(String in) throws Exception { ReportingTripleHandler outputHandler = new ReportingTripleHandler( new IgnoreAccidentalRDFa(new IgnoreTitlesOfEmptyDocuments( new NTriplesWriter(out)))); - return any23.extract(new ExtractionParameters(conf, ValidationMode.ValidateAndFix, null, null), + return any23.extract(new ExtractionParameters(conf, ValidationMode.VALIDATE_AND_FIX, null, null), new StringDocumentSource(in, "http://host.com/path"), outputHandler, "UTF-8"); } diff --git a/core/src/test/java/org/apache/any23/validator/DefaultValidatorTest.java b/core/src/test/java/org/apache/any23/validator/DefaultValidatorTest.java index c389f17c1..791767021 100644 --- a/core/src/test/java/org/apache/any23/validator/DefaultValidatorTest.java +++ b/core/src/test/java/org/apache/any23/validator/DefaultValidatorTest.java @@ -23,7 +23,6 @@ import org.junit.After; import org.junit.Assert; import org.junit.Before; -import org.junit.Ignore; import org.junit.Test; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -77,23 +76,21 @@ public void testMissingOGNamespace() throws IOException, ValidatorException, URI logger.debug( validationReport.toString() ); } } - - @Ignore("Itemscope parsing issue") + @Test public void testMissingItemscopeAttributeValue() throws IOException, URISyntaxException, ValidatorException { DOMDocument document = loadDocument("microdata-basic.html"); - List brokenItemScopeNodes = document.getNodesWithAttribute("itemscope"); - for (Node node : brokenItemScopeNodes) { + List nullItemScopeNodes = document.getNodesWithAttribute("itemscope"); + for (Node node : nullItemScopeNodes) { // all nodes with itemscope have an empty string value Assert.assertEquals("", node.getAttributes().getNamedItem("itemscope").getNodeValue() ); } ValidationReport validationReport = validator.validate(document, true); List fixedItemScopeNodes = document.getNodesWithAttribute("itemscope"); for (Node node : fixedItemScopeNodes) { - // all nodes with itemscope now have a default value of "itemscope" + // all nodes with itemscope now have a default value of "" e.g. empty string Assert.assertNotNull(node.getAttributes().getNamedItem("itemscope").getNodeValue() ); - Assert.assertNotEquals("", node.getAttributes().getNamedItem("itemscope").getNodeValue() ); - Assert.assertEquals("itemscope", node.getAttributes().getNamedItem("itemscope").getNodeValue()); + Assert.assertEquals("", node.getAttributes().getNamedItem("itemscope").getNodeValue() ); } if(logger.isDebugEnabled()) { logger.debug( validationReport.toString() ); @@ -126,8 +123,8 @@ public void testAboutNotIRIRule() throws Exception { Assert.assertEquals( "Unexpected number of issues.", 1, validationReport.getIssues().size() ); } - private DOMDocument loadDocument(String document) throws IOException, URISyntaxException { - InputStream is = this.getClass().getResourceAsStream(document); + public static DOMDocument loadDocument(String document) throws IOException, URISyntaxException { + InputStream is = DefaultValidatorTest.class.getResourceAsStream(document); final String documentIRI = "http://test.com"; TagSoupParser tsp = new TagSoupParser(is, documentIRI); return new DefaultDOMDocument( new URI(documentIRI), tsp.getDOM() ); diff --git a/core/src/test/java/org/apache/any23/validator/XMLValidationReportSerializerTest.java b/core/src/test/java/org/apache/any23/validator/XMLValidationReportSerializerTest.java index 237d14f97..5c3683ef2 100644 --- a/core/src/test/java/org/apache/any23/validator/XMLValidationReportSerializerTest.java +++ b/core/src/test/java/org/apache/any23/validator/XMLValidationReportSerializerTest.java @@ -68,7 +68,7 @@ public void testSerialize() Document document = new DocumentImpl(); Element element = document.createElement("html"); - validationReportBuilder.reportIssue(ValidationReport.IssueLevel.info, "Test message", element); + validationReportBuilder.reportIssue(ValidationReport.IssueLevel.INFO, "Test message", element); validationReportBuilder.traceRuleActivation( new MetaNameMisuseRule() ); diff --git a/service/src/main/java/org/apache/any23/servlet/RedirectServlet.java b/service/src/main/java/org/apache/any23/servlet/RedirectServlet.java index ede383d9b..ea87e005b 100644 --- a/service/src/main/java/org/apache/any23/servlet/RedirectServlet.java +++ b/service/src/main/java/org/apache/any23/servlet/RedirectServlet.java @@ -17,10 +17,15 @@ package org.apache.any23.servlet; +import javax.servlet.RequestDispatcher; import javax.servlet.ServletException; import javax.servlet.http.HttpServlet; import javax.servlet.http.HttpServletRequest; import javax.servlet.http.HttpServletResponse; + +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + import java.io.IOException; /** @@ -30,11 +35,22 @@ * @author Davide Palmisano ( palmisano@fbk.eu ) */ public class RedirectServlet extends HttpServlet { - + + private static final Logger LOG = LoggerFactory.getLogger(RedirectServlet.class); + + /** + * + */ + private static final long serialVersionUID = 1L; + @Override protected void doPost(HttpServletRequest request, HttpServletResponse response) throws ServletException, IOException { - doGet(request, response); + try { + doGet(request, response); + } catch (ServletException | IOException e) { + LOG.error("Error executing GET request.", e); + } } @Override @@ -44,21 +60,36 @@ protected void doGet(HttpServletRequest request, HttpServletResponse response) final String pathInfo = request.getPathInfo(); final String queryString = request.getQueryString(); - if (("/".equals(pathInfo) && queryString == null)) { - getServletContext().getRequestDispatcher("/resources/form.html").forward(request, response); + if ("/".equals(pathInfo) && queryString == null) { + RequestDispatcher dispatcher = getServletContext().getRequestDispatcher("/resources/form.html"); + try { + dispatcher.forward(request, response); + } catch (ServletException | IOException e) { + LOG.error("Error in request dispatcher forwarding.", e); + } return; } // forward requests to /resources/* to the default servlet, this is // where we can put static files if (pathInfo.startsWith("/resources/")) { - getServletContext().getNamedDispatcher("default").forward(request, response); + RequestDispatcher dispatcher = getServletContext().getNamedDispatcher("default"); + try { + dispatcher.forward(request, response); + } catch (ServletException | IOException e) { + LOG.error("Error in named request dispatcher forwarding.", e); + } return; } - response.sendRedirect( - request.getContextPath() + "/any23" + - request.getPathInfo() + - (queryString == null ? "" : "?" + queryString) - ); + try { + response.sendRedirect( + request.getContextPath() + "/any23" + + request.getPathInfo() + + (queryString == null ? "" : "?" + queryString) + ); + } catch (IOException e) { + LOG.error("Error in sending HttpServletResponse Redirect.", e); + } + } } diff --git a/service/src/main/java/org/apache/any23/servlet/Servlet.java b/service/src/main/java/org/apache/any23/servlet/Servlet.java index b93662e3f..b63d052e0 100644 --- a/service/src/main/java/org/apache/any23/servlet/Servlet.java +++ b/service/src/main/java/org/apache/any23/servlet/Servlet.java @@ -56,11 +56,11 @@ public class Servlet extends HttpServlet { private static final long serialVersionUID = 8207685628715421336L; - private final static Pattern schemeAndSingleSlashRegex = + private static final Pattern schemeAndSingleSlashRegex = Pattern.compile("^[a-zA-Z][a-zA-Z0-9.+-]*:/[^/]"); // RFC 3986: scheme = ALPHA *( ALPHA / DIGIT / "+" / "-" / "." ) - private final static Pattern schemeRegex = + private static final Pattern schemeRegex = Pattern.compile("^[a-zA-Z][a-zA-Z0-9.+-]*:"); @Override @@ -142,8 +142,6 @@ private String getFormatFromRequestOrNegotiation(HttpServletRequest request) { MediaRangeSpec result = Any23Negotiator.getNegotiator().getBestMatch(request.getHeader("Accept")); if (result == null) { return null; - } else if (RDFFormat.TURTLE.hasMIMEType(result.getMediaType())) { - return "turtle"; } else if (RDFFormat.N3.hasMIMEType(result.getMediaType())) { return "n3"; } else if (RDFFormat.NQUADS.hasMIMEType(result.getMediaType())) { @@ -155,7 +153,7 @@ private String getFormatFromRequestOrNegotiation(HttpServletRequest request) { } else if (RDFFormat.JSONLD.hasMIMEType(result.getMediaType())) { return "ld+json"; } else { - return "turtle"; // shouldn't happen + return "turtle"; // shouldn't happen however default is turtle } } @@ -260,13 +258,13 @@ private ValidationMode getValidationMode(HttpServletRequest request) { final String parameter = "validation-mode"; final String validationMode = request.getParameter(parameter); if (validationMode == null) - return ValidationMode.None; + return ValidationMode.NONE; if ("none".equalsIgnoreCase(validationMode)) - return ValidationMode.None; + return ValidationMode.NONE; if ("validate".equalsIgnoreCase(validationMode)) - return ValidationMode.Validate; + return ValidationMode.VALIDATE; if ("validate-fix".equalsIgnoreCase(validationMode)) - return ValidationMode.ValidateAndFix; + return ValidationMode.VALIDATE_AND_FIX; throw new IllegalArgumentException( String.format("Invalid value '%s' for '%s' parameter.", validationMode, parameter) ); diff --git a/service/src/main/java/org/apache/any23/servlet/WebResponder.java b/service/src/main/java/org/apache/any23/servlet/WebResponder.java index 3101e0900..5b1607094 100644 --- a/service/src/main/java/org/apache/any23/servlet/WebResponder.java +++ b/service/src/main/java/org/apache/any23/servlet/WebResponder.java @@ -329,7 +329,7 @@ private boolean initRdfWriter(String format, boolean report, boolean annotate) t FormatWriter fw = factory.getRdfWriter(byteOutStream); fw.setAnnotated(annotate); outputMediaType = factory.getMimeType(); - List tripleHandlers = new ArrayList(); + List tripleHandlers = new ArrayList<>(); tripleHandlers.add(new IgnoreAccidentalRDFa(fw)); tripleHandlers.add(new CountingTripleHandler()); rdfWriter = new CompositeTripleHandler(tripleHandlers); diff --git a/test-resources/src/test/resources/microdata/microdata-basic.html b/test-resources/src/test/resources/microdata/microdata-basic.html index 3ffca8425..e7d4dbab3 100644 --- a/test-resources/src/test/resources/microdata/microdata-basic.html +++ b/test-resources/src/test/resources/microdata/microdata-basic.html @@ -19,70 +19,87 @@ - -
-

My name is Elizabeth.

-
- -
-

My name is Daniel.

-
+ +
+

+ My name is Elizabeth. +

+
+ +
+

+ My name is Daniel. +

+
- -
-

My name is Neil.

-

My band is called Four Parts Water.

-

I am British.

-
+ +
+

+ My name is Neil. +

+

+ My band is called Four Parts Water. +

+

+ I am British. +

+
- -
+ +
Google -
+
- -
- I was born on . -
+ +
+ I was born on + + . +
- -
+ +

Flavors in my favorite ice cream:

    -
  • Lemon sorbet
  • -
  • Apricot sorbet
  • +
  • Lemon sorbet
  • +
  • Apricot sorbet
-
+
- -
+ +
orange -
+
- -
+ +
-
The Castle (1986)
-
+
+ + The Castle + + (1986) +
+
- - -
+ + +
The Castle (1986)
-
+
- -
+ +

Hedral

Hedral is a male american domestic shorthair, - with a fluffy black fur with white paws and belly.

- -
+ with a fluffy black fur with white paws and belly.

+ +
- -
+
Title
The Reality Dysfunction @@ -90,17 +107,21 @@

Hedral

Publication date
- -
+ +
- -
+ +

Hedral

-

Hedral is a male american domestic shorthair, with a fluffy - black fur with - white paws and belly.

- -
+

+ Hedral is a male american domestic shorthair, with a fluffy black fur with white paws and belly. +

+ +
From 15571d45f89e8c63b8da6a699b345131d4433ad9 Mon Sep 17 00:00:00 2001 From: Lewis John McGibbney Date: Sat, 30 Dec 2017 17:08:41 +0000 Subject: [PATCH 2/2] ANY23-318 ExtractionException handling in BaseRDFExtractor.java kills entire extraction --- .../apache/any23/validator/DefaultValidator.java | 13 ++++++++++--- .../rule/MissingItemscopeAttributeValueFix.java | 6 +----- .../any23/validator/DefaultValidatorTest.java | 4 ++-- 3 files changed, 13 insertions(+), 10 deletions(-) diff --git a/core/src/main/java/org/apache/any23/validator/DefaultValidator.java b/core/src/main/java/org/apache/any23/validator/DefaultValidator.java index 6ad2509d5..097a346dc 100644 --- a/core/src/main/java/org/apache/any23/validator/DefaultValidator.java +++ b/core/src/main/java/org/apache/any23/validator/DefaultValidator.java @@ -46,11 +46,12 @@ public class DefaultValidator implements Validator { private List> rulesOrder; public DefaultValidator() { - rulesToFixes = new HashMap, List>>(); - rulesOrder = new ArrayList>(); + rulesToFixes = new HashMap<>(); + rulesOrder = new ArrayList<>(); loadDefaultRules(); } + @Override public ValidationReport validate(DOMDocument document, boolean applyFix) throws ValidatorException { final ValidationReportBuilder validationReportBuilder = new DefaultValidationReportBuilder(); @@ -81,15 +82,17 @@ public ValidationReport validate(DOMDocument document, boolean applyFix) return validationReportBuilder.getReport(); } + @Override public ValidationReport validate(URI documentIRI, Document document, boolean applyFix) throws ValidatorException { return validate( new DefaultDOMDocument(documentIRI, document), applyFix ); } + @Override public synchronized void addRule(Class rule, Class fix) { List> fixes = rulesToFixes.get(rule); if(fixes == null) { - fixes = new ArrayList>(); + fixes = new ArrayList<>(); } rulesOrder.add(rule); rulesToFixes.put(rule, fixes); @@ -98,19 +101,23 @@ public synchronized void addRule(Class rule, Class rule) { addRule(rule, null); } + @Override public synchronized void removeRule(Class rule) { rulesOrder.remove(rule); rulesToFixes.remove(rule); } + @Override public List> getAllRules() { return Collections.unmodifiableList(rulesOrder); } + @Override public List> getFixes(Class rule) { List> fixes = rulesToFixes.get(rule); return fixes == null diff --git a/core/src/main/java/org/apache/any23/validator/rule/MissingItemscopeAttributeValueFix.java b/core/src/main/java/org/apache/any23/validator/rule/MissingItemscopeAttributeValueFix.java index 58e4f11aa..45fa21042 100644 --- a/core/src/main/java/org/apache/any23/validator/rule/MissingItemscopeAttributeValueFix.java +++ b/core/src/main/java/org/apache/any23/validator/rule/MissingItemscopeAttributeValueFix.java @@ -31,7 +31,7 @@ */ public class MissingItemscopeAttributeValueFix implements Fix { - private static final String EMPTY_ITEMSCOPE_VALUE = ""; + private static final String EMPTY_ITEMSCOPE_VALUE = "itemscope"; private static final String ITEMSCOPE = "itemscope"; @@ -57,10 +57,6 @@ public void execute(Rule rule, @SuppressWarnings("rawtypes") RuleContext context if (itemScopeNode.getTextContent() == null || itemScopeNode.getTextContent() == "") { String node = DomUtils.getXPathForNode(itemScopeContainerElement); document.addAttribute(node, ITEMSCOPE, EMPTY_ITEMSCOPE_VALUE); - //newItemScopeContainerElement.getAttributes().removeNamedItem(ITEMSCOPE); - //Attr newItemScopeNode = document.getOriginalDocument().createAttribute(ITEMSCOPE); - //newItemScopeNode.setNodeValue(EMPTY_ITEMSCOPE_VALUE); - //newItemScopeContainerElement.getAttributes().setNamedItem(newItemScopeNode); } } } diff --git a/core/src/test/java/org/apache/any23/validator/DefaultValidatorTest.java b/core/src/test/java/org/apache/any23/validator/DefaultValidatorTest.java index 791767021..13f393b14 100644 --- a/core/src/test/java/org/apache/any23/validator/DefaultValidatorTest.java +++ b/core/src/test/java/org/apache/any23/validator/DefaultValidatorTest.java @@ -88,9 +88,9 @@ public void testMissingItemscopeAttributeValue() throws IOException, URISyntaxEx ValidationReport validationReport = validator.validate(document, true); List fixedItemScopeNodes = document.getNodesWithAttribute("itemscope"); for (Node node : fixedItemScopeNodes) { - // all nodes with itemscope now have a default value of "" e.g. empty string + // all nodes with itemscope now have a default value of "itemscope" Assert.assertNotNull(node.getAttributes().getNamedItem("itemscope").getNodeValue() ); - Assert.assertEquals("", node.getAttributes().getNamedItem("itemscope").getNodeValue() ); + Assert.assertEquals("itemscope", node.getAttributes().getNamedItem("itemscope").getNodeValue() ); } if(logger.isDebugEnabled()) { logger.debug( validationReport.toString() );