From f23c25cc23938aa27551426d38dd0139fd30b9f4 Mon Sep 17 00:00:00 2001 From: Hans Date: Wed, 24 Oct 2018 10:35:10 -0500 Subject: [PATCH] ANY23-405 Parse microdata property values correctly --- .../extractor/microdata/ItemPropValue.java | 27 +++++ .../microdata/MicrodataExtractor.java | 6 +- .../extractor/microdata/MicrodataParser.java | 98 ++++++++++++++++--- .../java/org/apache/any23/rdf/RDFUtils.java | 10 +- ...odata-nested-url-resolving-expected.nquads | 2 +- 5 files changed, 120 insertions(+), 23 deletions(-) diff --git a/core/src/main/java/org/apache/any23/extractor/microdata/ItemPropValue.java b/core/src/main/java/org/apache/any23/extractor/microdata/ItemPropValue.java index b4710ded2..2b6659a0f 100644 --- a/core/src/main/java/org/apache/any23/extractor/microdata/ItemPropValue.java +++ b/core/src/main/java/org/apache/any23/extractor/microdata/ItemPropValue.java @@ -25,6 +25,8 @@ import java.util.Objects; import org.apache.any23.util.StringUtils; +import org.eclipse.rdf4j.model.Literal; +import org.eclipse.rdf4j.model.vocabulary.XMLSchema; /** * Describes a possible value for a Microdata item property. @@ -97,6 +99,31 @@ public ItemPropValue(Object content, Type type) { this.content = type.checkClass(content); } + ItemPropValue(Literal literal) { + this.literal = literal; + + Type type; + Object content; + + //for backwards compatibility: + if (XMLSchema.DATE.equals(literal.getDatatype()) || XMLSchema.DATETIME.equals(literal.getDatatype())) { + try { + content = parseDateTime(literal.getLabel()); + type = Type.Date; + } catch (Exception e) { + content = literal.getLabel(); + type = Type.Plain; + } + } else { + content = literal.getLabel(); + type = Type.Plain; + } + this.type = type; + this.content = content; + } + + Literal literal; + /** * @return the content object. */ diff --git a/core/src/main/java/org/apache/any23/extractor/microdata/MicrodataExtractor.java b/core/src/main/java/org/apache/any23/extractor/microdata/MicrodataExtractor.java index 3b45dd48a..d49f7ce59 100644 --- a/core/src/main/java/org/apache/any23/extractor/microdata/MicrodataExtractor.java +++ b/core/src/main/java/org/apache/any23/extractor/microdata/MicrodataExtractor.java @@ -33,6 +33,8 @@ import org.eclipse.rdf4j.model.Resource; import org.eclipse.rdf4j.model.IRI; import org.eclipse.rdf4j.model.Value; +import org.eclipse.rdf4j.model.datatypes.XMLDatatypeUtil; +import org.eclipse.rdf4j.model.impl.SimpleValueFactory; import org.eclipse.rdf4j.model.vocabulary.RDF; import org.eclipse.rdf4j.model.vocabulary.XMLSchema; import org.w3c.dom.Document; @@ -477,7 +479,9 @@ private void processProperty( Value value; Object propValue = itemProp.getValue().getContent(); ItemPropValue.Type propType = itemProp.getValue().getType(); - if (propType.equals(ItemPropValue.Type.Nested)) { + if (itemProp.getValue().literal != null) { + value = itemProp.getValue().literal; + } else if (propType.equals(ItemPropValue.Type.Nested)) { value = processType((ItemScope) propValue, documentIRI, out, mappings, defaultNamespace); } else if (propType.equals(ItemPropValue.Type.Plain)) { value = RDFUtils.literal((String) propValue, documentLanguage); diff --git a/core/src/main/java/org/apache/any23/extractor/microdata/MicrodataParser.java b/core/src/main/java/org/apache/any23/extractor/microdata/MicrodataParser.java index f30562038..970c31bbf 100644 --- a/core/src/main/java/org/apache/any23/extractor/microdata/MicrodataParser.java +++ b/core/src/main/java/org/apache/any23/extractor/microdata/MicrodataParser.java @@ -17,7 +17,11 @@ package org.apache.any23.extractor.microdata; import org.apache.any23.extractor.html.DomUtils; +import org.apache.any23.rdf.RDFUtils; import org.apache.commons.lang.StringUtils; +import org.eclipse.rdf4j.model.Literal; +import org.eclipse.rdf4j.model.datatypes.XMLDatatypeUtil; +import org.eclipse.rdf4j.model.vocabulary.XMLSchema; import org.w3c.dom.Document; import org.w3c.dom.Element; import org.w3c.dom.NamedNodeMap; @@ -315,8 +319,51 @@ public ItemPropValue getPropertyValue(Node node) throws MicrodataParserException return itemPropValue; final String nodeName = node.getNodeName().toLowerCase(); + + //see http://w3c.github.io/microdata-rdf/#dfn-property-values + if ("data".equals(nodeName) || "meter".equals(nodeName)) { + String value = StringUtils.stripToEmpty(readContentAttribute(node, "value")); + Literal l; + if (XMLDatatypeUtil.isValidInteger(value)) { + l = RDFUtils.literal(value, XMLSchema.INTEGER); + } else if (XMLDatatypeUtil.isValidDouble(value)) { + l = RDFUtils.literal(value, XMLSchema.DOUBLE); + } else { + l = RDFUtils.literal(value); + } + return new ItemPropValue(l); + } + if( "time".equals(nodeName) ) { + String dateTimeStr = StringUtils.stripToEmpty(readContentAttribute(node, "datetime")); + Literal l; + if (XMLDatatypeUtil.isValidDate(dateTimeStr)) { + l = RDFUtils.literal(dateTimeStr, XMLSchema.DATE); + } else if (XMLDatatypeUtil.isValidTime(dateTimeStr)) { + l = RDFUtils.literal(dateTimeStr, XMLSchema.TIME); + } else if (XMLDatatypeUtil.isValidDateTime(dateTimeStr)) { + l = RDFUtils.literal(dateTimeStr, XMLSchema.DATETIME); + } else if (XMLDatatypeUtil.isValidGYearMonth(dateTimeStr)) { + l = RDFUtils.literal(dateTimeStr, XMLSchema.GYEARMONTH); + } else if (XMLDatatypeUtil.isValidGYear(dateTimeStr)) { + l = RDFUtils.literal(dateTimeStr, XMLSchema.GYEAR); + } else if (XMLDatatypeUtil.isValidDuration(dateTimeStr)) { + l = RDFUtils.literal(dateTimeStr, XMLSchema.DURATION); + } else { + String lang = getLanguage(node); + if (lang != null) { + l = RDFUtils.literal(dateTimeStr, lang); + } else { + l = RDFUtils.literal(dateTimeStr); + } + } + return new ItemPropValue(l); + } + if (DomUtils.hasAttribute(node, "content")) { - return new ItemPropValue(DomUtils.readAttribute(node, "content"), ItemPropValue.Type.Plain); + String val = DomUtils.readAttribute(node, "content"); + String lang = getLanguage(node); + Literal l = lang == null ? RDFUtils.literal(val) : RDFUtils.literal(val, lang); + return new ItemPropValue(l); } if( SRC_TAGS.contains(nodeName) ) { @@ -329,29 +376,50 @@ public ItemPropValue getPropertyValue(Node node) throws MicrodataParserException if( "object".equals(nodeName) ) { return new ItemPropValue( DomUtils.readAttribute(node, "data"), ItemPropValue.Type.Link); } - if( "time".equals(nodeName) ) { - final String dateTimeStr = DomUtils.readAttribute(node, "datetime"); - final Date dateTime; - try { - dateTime = ItemPropValue.parseDateTime(dateTimeStr); - } catch (ParseException pe) { - throw new MicrodataParserException( - String.format("Invalid format for datetime '%s'", dateTimeStr), - node - ); - } - return new ItemPropValue(dateTime, ItemPropValue.Type.Date); - } if( isItemScope(node) ) { return new ItemPropValue( getItemScope(node), ItemPropValue.Type.Nested); } - final ItemPropValue newItemPropValue = new ItemPropValue( node.getTextContent(), ItemPropValue.Type.Plain); + String lang = getLanguage(node); + Literal l = lang == null ? RDFUtils.literal(node.getTextContent()) : RDFUtils.literal(node.getTextContent(), lang); + final ItemPropValue newItemPropValue = new ItemPropValue(l); itemPropValues.put(node, newItemPropValue); return newItemPropValue; } + private static String readContentAttribute(Node node, String attrName) { + NamedNodeMap attributes = node.getAttributes(); + if (attributes != null) { + Node attr = attributes.getNamedItem("content"); + if (attr != null) { + return attr.getNodeValue(); + } + attr = attributes.getNamedItem(attrName); + if (attr != null) { + return attr.getNodeValue(); + } + } + return node.getTextContent(); + } + + //see https://www.w3.org/TR/html52/dom.html#the-lang-and-xmllang-attributes + private static String getLanguage(Node node) { + String lang; + do { + lang = DomUtils.readAttribute(node, "xml:lang", null); + if (StringUtils.isNotBlank(lang)) { + return lang.trim(); + } + lang = DomUtils.readAttribute(node, "lang", null); + if (StringUtils.isNotBlank(lang)) { + return lang.trim(); + } + node = node.getParentNode(); + } while (node != null); + return null; + } + /** * Returns all the itemprops for the given itemscope node. * diff --git a/core/src/main/java/org/apache/any23/rdf/RDFUtils.java b/core/src/main/java/org/apache/any23/rdf/RDFUtils.java index 44a98e09a..552d61f05 100644 --- a/core/src/main/java/org/apache/any23/rdf/RDFUtils.java +++ b/core/src/main/java/org/apache/any23/rdf/RDFUtils.java @@ -274,9 +274,8 @@ public static Literal literal(double d) { /** * Creates a {@link Literal}. - * @param s string representation of the base namespace for the - * {@link org.eclipse.rdf4j.model.Literal} - * @param l the local name to associate with the namespace. + * @param s the literal's label + * @param l the literal's language * @return valid {@link org.eclipse.rdf4j.model.Literal} */ public static Literal literal(String s, String l) { @@ -290,9 +289,8 @@ public static Literal literal(String s, String l) { /** * Creates a {@link Literal}. - * @param s string representation of the base namespace for the - * {@link org.eclipse.rdf4j.model.Literal} - * @param datatype the datatype to associate with the namespace. + * @param s the literal's label + * @param datatype the literal's datatype * @return valid {@link org.eclipse.rdf4j.model.Literal} */ public static Literal literal(String s, org.eclipse.rdf4j.model.IRI datatype) { diff --git a/test-resources/src/test/resources/microdata/microdata-nested-url-resolving-expected.nquads b/test-resources/src/test/resources/microdata/microdata-nested-url-resolving-expected.nquads index 0eb4bcffe..0cff257cc 100644 --- a/test-resources/src/test/resources/microdata/microdata-nested-url-resolving-expected.nquads +++ b/test-resources/src/test/resources/microdata/microdata-nested-url-resolving-expected.nquads @@ -17,7 +17,7 @@ _:node1causocqkx2 . _:node1causocqkx2 "Solution-based problem-solving restricts the result before the start."@en . -_:node1causocqkx2 "2013-07-30"^^ . +_:node1causocqkx2 "2013-07-30T20:30:00+02:00"^^ . _:node1causocqkx3 . _:node1causocqkx3 "Ruben"@en . _:node1causocqkx3 "Verborgh"@en .