From b4defc1e67752d0b809c02000a815df688f12a48 Mon Sep 17 00:00:00 2001 From: jabail Date: Thu, 20 Apr 2017 11:54:15 -0400 Subject: [PATCH] SLING-6783 updates for org.apache.commons.html --- contrib/commons/html/NOTICE | 2 +- contrib/commons/html/README.md | 39 ++++++++++++++++++ contrib/commons/html/pom.xml | 14 +++---- .../commons/html/impl/HtmlParserImpl.java | 40 +++++++++++++++++-- 4 files changed, 83 insertions(+), 12 deletions(-) create mode 100644 contrib/commons/html/README.md diff --git a/contrib/commons/html/NOTICE b/contrib/commons/html/NOTICE index be0c7d1210..92f43bf55f 100644 --- a/contrib/commons/html/NOTICE +++ b/contrib/commons/html/NOTICE @@ -8,4 +8,4 @@ This product includes software developed at The Apache Software Foundation (http://www.apache.org/). This product includes software developed at -http://home.ccil.org/~cowan/XML/tagsoup/ \ No newline at end of file +http://vrici.lojban.org/~cowan/XML/tagsoup/ \ No newline at end of file diff --git a/contrib/commons/html/README.md b/contrib/commons/html/README.md new file mode 100644 index 0000000000..15893118dd --- /dev/null +++ b/contrib/commons/html/README.md @@ -0,0 +1,39 @@ +# current settings and their default values + +* http://xml.org/sax/features/namespaces=true +* http://xml.org/sax/features/namespace-prefixes=false +* http://xml.org/sax/features/external-general-entities=false +* http://xml.org/sax/features/external-parameter-entities=false +* http://xml.org/sax/features/is-standalone=false +* http://xml.org/sax/features/lexical-handler/parameter-entities=false +* http://xml.org/sax/features/resolve-dtd-uris=true +* http://xml.org/sax/features/string-interning=true +* http://xml.org/sax/features/use-attributes2=false +* http://xml.org/sax/features/use-locator2=false +* http://xml.org/sax/features/use-entity-resolver2=false +* http://xml.org/sax/features/validation=false +* http://xml.org/sax/features/xmlns-uris=false +* http://xml.org/sax/features/xmlns-uris=false +* 
http://xml.org/sax/features/xml-1.1=false + +Default SAX features are defined here: +http://www.saxproject.org/apidoc/org/xml/sax/package-summary.html + +TagSoup-specific features are: + +* http://www.ccil.org/~cowan/tagsoup/features/ignore-bogons=false + A value of "true" indicates that the parser will ignore unknown elements. +* http://www.ccil.org/~cowan/tagsoup/features/bogons-empty=false + A value of "true" indicates that the parser will give unknown elements a content model of EMPTY; a value of "false", a content model of ANY. +* http://www.ccil.org/~cowan/tagsoup/features/root-bogons=true + A value of "true" indicates that the parser will allow unknown elements to be the root of the output document. +* http://www.ccil.org/~cowan/tagsoup/features/default-attributes=true + A value of "true" indicates that the parser will return default attribute values for missing attributes that have default values. +* http://www.ccil.org/~cowan/tagsoup/features/translate-colons=false + A value of "true" indicates that the parser will translate colons into underscores in names. +* http://www.ccil.org/~cowan/tagsoup/features/restart-elements=true + A value of "true" indicates that the parser will attempt to restart the restartable elements. +* http://www.ccil.org/~cowan/tagsoup/features/ignorable-whitespace=false + A value of "true" indicates that the parser will transmit whitespace in element-only content via the SAX ignorableWhitespace callback. Normally this is not done, because HTML is an SGML application and SGML suppresses such whitespace. +* http://www.ccil.org/~cowan/tagsoup/features/cdata-elements=true + A value of "true" indicates that the parser will process the script and style elements (or any elements with type='cdata' in the TSSL schema) as SGML CDATA elements (that is, no markup is recognized except the matching end-tag). 
diff --git a/contrib/commons/html/pom.xml b/contrib/commons/html/pom.xml index 97eddd93f8..f022b61161 100644 --- a/contrib/commons/html/pom.xml +++ b/contrib/commons/html/pom.xml @@ -23,7 +23,7 @@ org.apache.sling sling - 26 + 30 @@ -44,10 +44,6 @@ - - org.apache.felix - maven-scr-plugin - org.apache.felix maven-bundle-plugin @@ -85,11 +81,13 @@ org.ccil.cowan.tagsoup tagsoup - 1.2 + 1.2.1 - org.apache.felix - org.apache.felix.scr.annotations + org.apache.sling + org.apache.sling.commons.osgi + 2.2.0 + provided diff --git a/contrib/commons/html/src/main/java/org/apache/sling/commons/html/impl/HtmlParserImpl.java b/contrib/commons/html/src/main/java/org/apache/sling/commons/html/impl/HtmlParserImpl.java index d3cd9b860e..3c3866bcff 100644 --- a/contrib/commons/html/src/main/java/org/apache/sling/commons/html/impl/HtmlParserImpl.java +++ b/contrib/commons/html/src/main/java/org/apache/sling/commons/html/impl/HtmlParserImpl.java @@ -20,11 +20,16 @@ import java.io.IOException; import java.io.InputStream; +import java.util.Map; -import org.apache.felix.scr.annotations.Component; -import org.apache.felix.scr.annotations.Service; import org.apache.sling.commons.html.HtmlParser; +import org.apache.sling.commons.osgi.PropertiesUtil; import org.ccil.cowan.tagsoup.Parser; +import org.osgi.service.component.annotations.Activate; +import org.osgi.service.component.annotations.Component; +import org.osgi.service.metatype.annotations.AttributeDefinition; +import org.osgi.service.metatype.annotations.Designate; +import org.osgi.service.metatype.annotations.ObjectClassDefinition; import org.w3c.dom.Document; import org.xml.sax.ContentHandler; import org.xml.sax.InputSource; @@ -32,8 +37,19 @@ import org.xml.sax.ext.LexicalHandler; @Component -@Service(value=HtmlParser.class) +@Designate(ocd = HtmlParserImpl.Config.class) public class HtmlParserImpl implements HtmlParser { + + @ObjectClassDefinition(name="Apache Sling HTML Parser", description="Parser configuration") + static 
@interface Config { + + @AttributeDefinition(name = "Parser Properties", + description = "Additional properties to be applied to the underlying parser in the format of key=[true|false]") + String[] properties(); + + } + + private final Map<String, Boolean> features = new java.util.HashMap<String, Boolean>(); /* feature URI -> flag, filled by activate(); created eagerly so parse() never sees null */ /** * @see org.apache.sling.commons.html.HtmlParser#parse(java.io.InputStream, java.lang.String, org.xml.sax.ContentHandler) @@ -44,6 +60,11 @@ public void parse(InputStream stream, String encoding, ContentHandler ch) if ( ch instanceof LexicalHandler ) { parser.setProperty("http://xml.org/sax/properties/lexical-handler", ch); } + if (!features.isEmpty()){ + for (String feature:features.keySet()){ + parser.setFeature(feature, features.get(feature)); /* configured entries are boolean SAX features, so apply them with setFeature rather than setProperty */ + } + } parser.setContentHandler(ch); final InputSource source = new InputSource(stream); source.setEncoding(encoding); @@ -68,6 +89,11 @@ public Document parse(String systemId, InputStream stream, String encoding) thro try { parser.setProperty("http://xml.org/sax/properties/lexical-handler", builder); + if (!features.isEmpty()){ + for (String feature : features.keySet()) { + parser.setFeature(feature, features.get(feature)); /* configured entries are boolean SAX features, so apply them with setFeature rather than setProperty */ + } + } parser.setContentHandler(builder); parser.parse(source); } catch (SAXException se) { @@ -78,4 +104,12 @@ public Document parse(String systemId, InputStream stream, String encoding) thro } return builder.getDocument(); } + + @Activate + private void activate(Config config) { /* copies the configured key=[true|false] entries into the features map */ + Map<String, String> temp = PropertiesUtil.toMap(config.properties(), new String[]{}); + for (String key : temp.keySet()){ + features.put(key, Boolean.valueOf(temp.get(key))); /* any value other than "true" (case-insensitive) becomes false */ + } + } }