From 1ffd60a68d20e7f313745202c012ac3fd2b49738 Mon Sep 17 00:00:00 2001 From: Lewis John McGibbney Date: Wed, 15 Feb 2017 23:17:40 -0800 Subject: [PATCH] ANY23-304 Add extractor for OpenIE 1st pass --- .../java/org/apache/any23/vocab/YAML.java | 70 +- core/pom.xml | 5 +- .../extractor/html/HTMLMetaExtractor.java | 35 +- .../extractor/html/TurtleHTMLExtractor.java | 2 +- .../extractor/openie/OpenIEExtractor.java | 111 +++ .../openie/OpenIEExtractorFactory.java | 53 ++ .../any23/extractor/openie/package-info.java | 24 + .../any23/extractor/xpath/XPathExtractor.java | 9 +- .../any23/extractor/yaml/YAMLExtractor.java | 18 +- .../extractor/yaml/YAMLExtractorFactory.java | 29 +- .../java/org/apache/any23/rdf/RDFUtils.java | 21 +- .../org/apache/any23/util/StreamUtils.java | 74 +- .../extractor/openie/example-openie.html | 631 ++++++++++++++++++ .../extractor/openie/OpenIEExtractorTest.java | 88 +++ .../any23/extractor/openie/package-info.java | 17 + pom.xml | 5 + .../extractor/openie/example-openie.html | 631 ++++++++++++++++++ 17 files changed, 1727 insertions(+), 96 deletions(-) create mode 100644 core/src/main/java/org/apache/any23/extractor/openie/OpenIEExtractor.java create mode 100644 core/src/main/java/org/apache/any23/extractor/openie/OpenIEExtractorFactory.java create mode 100644 core/src/main/java/org/apache/any23/extractor/openie/package-info.java create mode 100644 core/src/main/resources/org/apache/any23/extractor/openie/example-openie.html create mode 100644 core/src/test/java/org/apache/any23/extractor/openie/OpenIEExtractorTest.java create mode 100644 core/src/test/java/org/apache/any23/extractor/openie/package-info.java create mode 100644 test-resources/src/test/resources/org/apache/any23/extractor/openie/example-openie.html diff --git a/api/src/main/java/org/apache/any23/vocab/YAML.java b/api/src/main/java/org/apache/any23/vocab/YAML.java index 41df32895..ffca1e61e 100644 --- a/api/src/main/java/org/apache/any23/vocab/YAML.java +++ 
b/api/src/main/java/org/apache/any23/vocab/YAML.java @@ -24,52 +24,52 @@ */ public class YAML extends Vocabulary { - /* - * Namespace of YAML vocabulary - */ - public static final String NS = "http://yaml.org/spec/1.2/spec.html#"; + /* + * Namespace of YAML vocabulary + */ + public static final String NS = "http://yaml.org/spec/1.2/spec.html#"; - public static final String PREFIX = "yaml"; + public static final String PREFIX = "yaml"; - public static final String ROOT = "Root"; + public static final String ROOT = "Root"; - public static final String DOCUMENT = "Document"; + public static final String DOCUMENT = "Document"; - public static final String NODE = "Node"; + public static final String NODE = "Node"; - public static final String CONTAINS = "contains"; + public static final String CONTAINS = "contains"; - private static final YAML _instance = new YAML(); + private static final YAML _instance = new YAML(); - private YAML() { - super(NS); - } + private YAML() { + super(NS); + } - public static YAML getInstance() { - return _instance; - } + public static YAML getInstance() { + return _instance; + } - /** - *

The root node. Representation of the YAML file. NB: one file may contain more than one documents - * represented by nodes; e.g.

- *

- * - * %YAML 1.2 - * --- - * - data1 - * - data2 - * --- - * - data3 - * - *

- * Contains two documents. - */ - public final IRI root = createProperty(NS, ROOT); + /** + *

The root node. Representation of the YAML file. NB: one file may contain more than one documents + * represented by nodes; e.g.

+ *

+ * + * %YAML 1.2 + * --- + * - data1 + * - data2 + * --- + * - data3 + * + *

+ * Contains two documents. + */ + public final IRI root = createProperty(NS, ROOT); - public final IRI document = createProperty(NS, DOCUMENT); + public final IRI document = createProperty(NS, DOCUMENT); - public final IRI node = createProperty(NS, NODE); + public final IRI node = createProperty(NS, NODE); - public final IRI contains = createProperty(NS, CONTAINS); + public final IRI contains = createProperty(NS, CONTAINS); } diff --git a/core/pom.xml b/core/pom.xml index f03c67284..150a5f071 100644 --- a/core/pom.xml +++ b/core/pom.xml @@ -83,7 +83,10 @@ snakeyaml 1.17 - + + edu.washington.cs.knowitall.openie + openie_2.10 + org.apache.tika diff --git a/core/src/main/java/org/apache/any23/extractor/html/HTMLMetaExtractor.java b/core/src/main/java/org/apache/any23/extractor/html/HTMLMetaExtractor.java index e67ec42d9..7cd8a39be 100644 --- a/core/src/main/java/org/apache/any23/extractor/html/HTMLMetaExtractor.java +++ b/core/src/main/java/org/apache/any23/extractor/html/HTMLMetaExtractor.java @@ -51,7 +51,7 @@ public class HTMLMetaExtractor implements Extractor.TagSoupDOMExtractor { private IRI profile; - private Map prefixes = new HashMap(); + private Map prefixes = new HashMap<>(); private String documentLang; @@ -82,29 +82,29 @@ public void run( lang = meta.getLang(); } if(meta.isPragmaDirective){ - if(lang != null) { - out.writeTriple( + if(lang != null) { + out.writeTriple( documentIRI, meta.getHttpEquiv(), SimpleValueFactory.getInstance().createLiteral(meta.getContent(), lang)); - } else { + } else { out.writeTriple( documentIRI, meta.getHttpEquiv(), SimpleValueFactory.getInstance().createLiteral(meta.getContent())); - } + } }else { - if(lang != null) { - out.writeTriple( + if(lang != null) { + out.writeTriple( documentIRI, meta.getName(), SimpleValueFactory.getInstance().createLiteral(meta.getContent(), lang)); - } else { - out.writeTriple( + } else { + out.writeTriple( documentIRI, meta.getName(), 
SimpleValueFactory.getInstance().createLiteral(meta.getContent())); - } + } } } } @@ -117,7 +117,7 @@ public void run( */ private String getDocumentLanguage(Document in) { String lang = DomUtils.find(in, "string(/HTML/@lang)"); - if (lang.equals("")) { + if ("".equals(lang)) { return null; } return lang; @@ -125,7 +125,7 @@ private String getDocumentLanguage(Document in) { private IRI extractProfile(Document in) { String profile = DomUtils.find(in, "string(/HTML/@profile)"); - if (profile.equals("")) { + if ("".equals(profile)) { return null; } return SimpleValueFactory.getInstance().createIRI(profile); @@ -150,7 +150,7 @@ private void extractLinkDefinedPrefixes(Document in) { private Set extractMetaElement(Document in, String baseProfile) { List metaNodes = DomUtils.findAll(in, "/HTML/HEAD/META"); - Set result = new HashSet(); + Set result = new HashSet<>(); for (Node metaNode : metaNodes) { NamedNodeMap attributes = metaNode.getAttributes(); Node nameAttribute = attributes.getNamedItem("name"); @@ -281,12 +281,15 @@ public void setContent(String content) { @Override public boolean equals(Object o) { - if (this == o) return true; - if (o == null || getClass() != o.getClass()) return false; + if (this == o) + return true; + if (o == null || getClass() != o.getClass()) + return false; Meta meta = (Meta) o; - if (xpath != null ? !xpath.equals(meta.xpath) : meta.xpath != null) return false; + if (xpath != null ? !xpath.equals(meta.xpath) : meta.xpath != null) + return false; return true; } diff --git a/core/src/main/java/org/apache/any23/extractor/html/TurtleHTMLExtractor.java b/core/src/main/java/org/apache/any23/extractor/html/TurtleHTMLExtractor.java index 17b54e681..1ad8dfae8 100644 --- a/core/src/main/java/org/apache/any23/extractor/html/TurtleHTMLExtractor.java +++ b/core/src/main/java/org/apache/any23/extractor/html/TurtleHTMLExtractor.java @@ -82,7 +82,7 @@ public ExtractorDescription getDescription() { * @param ns the list of script nodes. 
*/ private void processScriptNodes(IRI documentIRI, ExtractionContext ec, ExtractionResult er, List ns) { - if(ns.size() > 0 && turtleParser == null) { + if(!ns.isEmpty() && turtleParser == null) { turtleParser = RDFParserFactory.getInstance().getTurtleParserInstance(true, false, ec, er); } for(Node n : ns) { diff --git a/core/src/main/java/org/apache/any23/extractor/openie/OpenIEExtractor.java b/core/src/main/java/org/apache/any23/extractor/openie/OpenIEExtractor.java new file mode 100644 index 000000000..686ecad48 --- /dev/null +++ b/core/src/main/java/org/apache/any23/extractor/openie/OpenIEExtractor.java @@ -0,0 +1,111 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.any23.extractor.openie; + +import java.io.IOException; +import java.util.List; + +import javax.xml.transform.TransformerConfigurationException; +import javax.xml.transform.TransformerFactoryConfigurationError; + +import org.apache.any23.extractor.Extractor; +import org.apache.any23.extractor.ExtractionContext; +import org.apache.any23.extractor.ExtractorDescription; +import org.apache.any23.util.StreamUtils; +import org.eclipse.rdf4j.model.IRI; +import org.apache.any23.extractor.ExtractionException; +import org.apache.any23.extractor.ExtractionParameters; +import org.apache.any23.extractor.ExtractionResult; + +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; +import org.w3c.dom.Document; + +import edu.knowitall.openie.Argument; +import edu.knowitall.openie.Instance; +import edu.knowitall.openie.OpenIE; +import edu.knowitall.tool.parse.ClearParser; +import edu.knowitall.tool.postag.ClearPostagger; +import edu.knowitall.tool.srl.ClearSrl; +import edu.knowitall.tool.tokenize.ClearTokenizer; +import scala.collection.JavaConversions; +import scala.collection.Seq; + + + +/** + * An OpenIE + * extractor able to generate RDF statements from + * sentences representing relations in the text. 
+ */ +public class OpenIEExtractor implements Extractor.TagSoupDOMExtractor { + + private final Logger LOG = LoggerFactory.getLogger(getClass()); + + private IRI documentRoot; + + /** + * default constructor + */ + OpenIEExtractor() { + // default constructor + } + + /** + * @see org.apache.any23.extractor.Extractor#getDescription() + */ + @Override + public ExtractorDescription getDescription() { + return OpenIEExtractorFactory.getDescriptionInstance(); + } + + @Override + public void run(ExtractionParameters extractionParameters, + ExtractionContext context, Document in, ExtractionResult out) + throws IOException, ExtractionException { + OpenIE openIE = new OpenIE(new ClearParser(new ClearPostagger(new ClearTokenizer())), new ClearSrl(), false, false); + + + Seq extractions = null; + try { + extractions = openIE.extract(StreamUtils.asString(StreamUtils.documentToInputStream(in))); + } catch (TransformerConfigurationException | TransformerFactoryConfigurationError e) { + LOG.error("Error during extraction: {}", e); + } + + List listExtractions = JavaConversions.seqAsJavaList(extractions); + for(Instance instance : listExtractions) { + StringBuilder sb = new StringBuilder(); + + sb.append(instance.confidence()) + .append('\t') + .append(instance.extr().context()) + .append('\t') + .append(instance.extr().arg1().text()) + .append('\t') + .append(instance.extr().rel().text()) + .append('\t'); + + List listArg2s = JavaConversions.seqAsJavaList(instance.extr().arg2s()); + for(Argument argument : listArg2s) { + sb.append(argument.text()).append("; "); + } + System.out.println(sb.toString()); + } + } + +} diff --git a/core/src/main/java/org/apache/any23/extractor/openie/OpenIEExtractorFactory.java b/core/src/main/java/org/apache/any23/extractor/openie/OpenIEExtractorFactory.java new file mode 100644 index 000000000..161029233 --- /dev/null +++ b/core/src/main/java/org/apache/any23/extractor/openie/OpenIEExtractorFactory.java @@ -0,0 +1,53 @@ +/* + * Licensed to the 
Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.any23.extractor.openie; + +import java.util.Arrays; + +import org.apache.any23.extractor.ExtractorDescription; +import org.apache.any23.extractor.ExtractorFactory; +import org.apache.any23.extractor.SimpleExtractorFactory; +import org.apache.any23.rdf.Prefixes; + +/** + * @author lewismc + * + */ +public class OpenIEExtractorFactory extends SimpleExtractorFactory + implements ExtractorFactory { + + public static final String NAME = "openie"; + + public static final Prefixes prefixes = null; + + private static final ExtractorDescription descriptionInstance = new OpenIEExtractorFactory(); + + public OpenIEExtractorFactory() { + super(NAME, prefixes, Arrays.asList("text/html;q=0.1", "application/xhtml+xml;q=0.1"), "example-openie.html"); + } + + @Override + public OpenIEExtractor createExtractor() { + return new OpenIEExtractor(); + } + + public static ExtractorDescription getDescriptionInstance() { + return descriptionInstance; + } + + +} diff --git a/core/src/main/java/org/apache/any23/extractor/openie/package-info.java b/core/src/main/java/org/apache/any23/extractor/openie/package-info.java new file mode 100644 index 000000000..38683c559 --- /dev/null +++ 
b/core/src/main/java/org/apache/any23/extractor/openie/package-info.java @@ -0,0 +1,24 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * This package provides an + * OpenIE + * extractor able to generate RDF statements from + * sentences representing relations in the text. 
+ */ +package org.apache.any23.extractor.openie; \ No newline at end of file diff --git a/core/src/main/java/org/apache/any23/extractor/xpath/XPathExtractor.java b/core/src/main/java/org/apache/any23/extractor/xpath/XPathExtractor.java index b04533ce5..33b8a2792 100644 --- a/core/src/main/java/org/apache/any23/extractor/xpath/XPathExtractor.java +++ b/core/src/main/java/org/apache/any23/extractor/xpath/XPathExtractor.java @@ -39,11 +39,12 @@ */ public class XPathExtractor implements Extractor.TagSoupDOMExtractor { - private final List xPathExtractionRules = new ArrayList(); + private final List xPathExtractionRules = new ArrayList<>(); public XPathExtractor() { + //default constructor } - + public XPathExtractor(List rules) { xPathExtractionRules.addAll(rules); } @@ -66,8 +67,8 @@ public void run( ExtractionContext extractionContext, Document in, ExtractionResult out - ) - throws IOException, ExtractionException { + ) + throws IOException, ExtractionException { final IRI documentIRI = extractionContext.getDocumentIRI(); for(XPathExtractionRule rule : xPathExtractionRules) { if(rule.acceptIRI(documentIRI)) { diff --git a/core/src/main/java/org/apache/any23/extractor/yaml/YAMLExtractor.java b/core/src/main/java/org/apache/any23/extractor/yaml/YAMLExtractor.java index 64548f198..a4c22bf01 100644 --- a/core/src/main/java/org/apache/any23/extractor/yaml/YAMLExtractor.java +++ b/core/src/main/java/org/apache/any23/extractor/yaml/YAMLExtractor.java @@ -17,8 +17,6 @@ import java.io.IOException; import java.io.InputStream; -import java.util.ArrayList; -import java.util.Arrays; import java.util.Iterator; import java.util.List; import java.util.Map; @@ -31,7 +29,6 @@ import org.apache.any23.rdf.RDFUtils; import org.apache.any23.util.StringUtils; import org.apache.any23.vocab.YAML; -import org.apache.commons.lang.WordUtils; import org.eclipse.rdf4j.model.Resource; import org.eclipse.rdf4j.model.IRI; import org.eclipse.rdf4j.model.Value; @@ -58,6 +55,7 @@ public class 
YAMLExtractor implements Extractor.ContentExtractor { @Override public void setStopAtFirstError(boolean f) { + //empty } @Override @@ -65,9 +63,9 @@ public void run(ExtractionParameters extractionParameters, ExtractionContext con ExtractionResult out) throws IOException, ExtractionException { IRI documentURI = context.getDocumentIRI(); - documentRoot = RDFUtils.uri(documentURI.toString() + "root"); + documentRoot = RDFUtils.iri(documentURI.toString() + "root"); - log.debug("process: {}", documentURI.toString()); + log.debug("Processing: {}", documentURI.toString()); out.writeNamespace(vocab.PREFIX, vocab.NS); out.writeNamespace(RDF.PREFIX, RDF.NAMESPACE); out.writeNamespace(RDFS.PREFIX, RDFS.NAMESPACE); @@ -99,9 +97,9 @@ private Value buildNode(IRI fileURI, Object treeData, ExtractionResult out) { if (treeData == null) { return RDF.NIL; } else if (treeData instanceof Map) { - return processMap(fileURI, (Map) treeData, out); + return processMap(fileURI, (Map) treeData, out); } else if (treeData instanceof List) { - return processList(fileURI, (List) treeData, out); + return processList(fileURI, (List) treeData, out); } else if (treeData instanceof Long) { return RDFUtils.literal(((Long) treeData)); } else if (treeData instanceof Integer) { @@ -132,13 +130,13 @@ private Value processMap(IRI file, Map node, ExtractionResult ou return nodeURI; } - private Value processList(IRI fileURI, Iterable iter, ExtractionResult out) { + private Value processList(IRI fileURI, Iterable iter, ExtractionResult out) { Resource node = YAMLExtractor.this.makeUri(); out.writeTriple(node, RDF.TYPE, RDF.LIST); Resource pList = null; // previous RDF iter node Resource cList = node; // cutternt RDF iter node - Iterator listIter = iter.iterator(); + Iterator listIter = iter.iterator(); while (listIter.hasNext()) { // If previous RDF iter node is given lint with current one if (pList != null) { @@ -187,7 +185,7 @@ private Resource makeUri(String type, IRI docUri, boolean addId) { uriString = 
uriString + "_" + Integer.toString(nodeId); } - Resource node = RDFUtils.uri(uriString); + Resource node = RDFUtils.iri(uriString); if (addId) { nodeId++; } diff --git a/core/src/main/java/org/apache/any23/extractor/yaml/YAMLExtractorFactory.java b/core/src/main/java/org/apache/any23/extractor/yaml/YAMLExtractorFactory.java index 82cdc23a7..cb090ab1e 100644 --- a/core/src/main/java/org/apache/any23/extractor/yaml/YAMLExtractorFactory.java +++ b/core/src/main/java/org/apache/any23/extractor/yaml/YAMLExtractorFactory.java @@ -25,27 +25,26 @@ /** * @author Jacek Grzebyta (grzebyta.dev [at] gmail.com) */ -public class YAMLExtractorFactory extends SimpleExtractorFactory - implements ExtractorFactory +public class YAMLExtractorFactory extends SimpleExtractorFactory implements ExtractorFactory { - public static final String NAME = "yaml"; + public static final String NAME = "yaml"; - public static final Prefixes prefixes = null; + public static final Prefixes prefixes = null; - private static final ExtractorDescription descriptionInstance = new YAMLExtractorFactory(); + private static final ExtractorDescription descriptionInstance = new YAMLExtractorFactory(); - public YAMLExtractorFactory() { - super(NAME, prefixes, Arrays.asList("text/x-yaml;q=0.5"), "example.yaml"); - } + public YAMLExtractorFactory() { + super(NAME, prefixes, Arrays.asList("text/x-yaml;q=0.5"), "example.yaml"); + } - @Override - public YAMLExtractor createExtractor() { - return new YAMLExtractor(); - } + @Override + public YAMLExtractor createExtractor() { + return new YAMLExtractor(); + } - public static ExtractorDescription getDescriptionInstance() { - return descriptionInstance; - } + public static ExtractorDescription getDescriptionInstance() { + return descriptionInstance; + } } diff --git a/core/src/main/java/org/apache/any23/rdf/RDFUtils.java b/core/src/main/java/org/apache/any23/rdf/RDFUtils.java index bbfe5ec68..3b074eb2e 100644 --- a/core/src/main/java/org/apache/any23/rdf/RDFUtils.java +++ 
b/core/src/main/java/org/apache/any23/rdf/RDFUtils.java @@ -71,7 +71,8 @@ public class RDFUtils { */ public static String fixAbsoluteIRI(String uri) { String fixed = fixIRIWithException(uri); - if (!fixed.matches("[a-zA-Z0-9]+:/.*")) throw new IllegalArgumentException("not a absolute org.eclipse.rdf4j.model.IRI: " + uri); + if (!fixed.matches("[a-zA-Z0-9]+:/.*")) + throw new IllegalArgumentException("not a absolute org.eclipse.rdf4j.model.IRI: " + uri); // Add trailing slash if org.eclipse.rdf4j.model.IRI has only authority but no path. if (fixed.matches("https?://[a-zA-Z0-9.-]+(:[0-9+])?")) { fixed = fixed + "/"; @@ -129,7 +130,8 @@ public static String toXSDDateTime(Date date) { * @return the unescaped string. */ public static String fixIRIWithException(String unescapedIRI) { - if (unescapedIRI == null) throw new IllegalArgumentException("org.eclipse.rdf4j.model.IRI was null"); + if (unescapedIRI == null) + throw new IllegalArgumentException("org.eclipse.rdf4j.model.IRI was null"); // Remove starting and ending whitespace String escapedIRI = unescapedIRI.trim(); @@ -141,7 +143,8 @@ public static String fixIRIWithException(String unescapedIRI) { escapedIRI = escapedIRI.replaceAll("\n", ""); //'Remove starting "\" or '"' - if (escapedIRI.startsWith("\\") || escapedIRI.startsWith("\"")) escapedIRI = escapedIRI.substring(1); + if (escapedIRI.startsWith("\\") || escapedIRI.startsWith("\"")) + escapedIRI = escapedIRI.substring(1); //Remove ending "\" or '"' if (escapedIRI.endsWith("\\") || escapedIRI.endsWith("\"")) escapedIRI = escapedIRI.substring(0, escapedIRI.length() - 1); @@ -406,7 +409,8 @@ public static Statement quad(String s, String p, String o, String g) { * @return a value instance. 
*/ public static Value toValue(String s) { - if ("a".equals(s)) return RDF.TYPE; + if ("a".equals(s)) + return RDF.TYPE; if (s.matches("[a-z0-9]+:.*")) { return PopularPrefixes.get().expand(s); } @@ -466,7 +470,8 @@ public static RDFWriter getWriter(RDFFormat format, OutputStream os) { * @throws IllegalArgumentException if no extension matches. */ public static Optional getFormatByExtension(String ext) { - if( ! ext.startsWith(".") ) ext = "." + ext; + if( ! ext.startsWith(".") ) + ext = "." + ext; return Rio.getParserFormatForFileName(ext); } @@ -537,12 +542,12 @@ public static Statement[] parseRDF(RDFFormat format, String in) * @throws org.eclipse.rdf4j.rio.RDFParseException if an error occurs while parsing file. */ public static Statement[] parseRDF(String resource) throws RDFHandlerException, IOException, RDFParseException { - final int extIndex = resource.lastIndexOf("."); + final int extIndex = resource.lastIndexOf('.'); if(extIndex == -1) throw new IllegalArgumentException("Error while detecting the extension in resource name " + resource); final String extension = resource.substring(extIndex + 1); return parseRDF( getFormatByExtension(extension).orElseThrow(Rio.unsupportedFormat(extension)) - , RDFUtils.class.getResourceAsStream(resource) ); + , RDFUtils.class.getResourceAsStream(resource) ); } /** @@ -564,6 +569,4 @@ public static boolean isAbsoluteIRI(String href) { } } - private RDFUtils() {} - } diff --git a/core/src/main/java/org/apache/any23/util/StreamUtils.java b/core/src/main/java/org/apache/any23/util/StreamUtils.java index 2022f0eb6..761c16094 100644 --- a/core/src/main/java/org/apache/any23/util/StreamUtils.java +++ b/core/src/main/java/org/apache/any23/util/StreamUtils.java @@ -17,10 +17,17 @@ package org.apache.any23.util; +import org.apache.commons.io.ByteOrderMark; +import org.apache.commons.io.input.BOMInputStream; +import org.apache.xerces.impl.io.MalformedByteSequenceException; import org.slf4j.Logger; import org.slf4j.LoggerFactory; 
+import org.w3c.dom.Document; +import org.xml.sax.SAXException; import java.io.BufferedReader; +import java.io.ByteArrayInputStream; +import java.io.ByteArrayOutputStream; import java.io.Closeable; import java.io.IOException; import java.io.InputStream; @@ -28,6 +35,18 @@ import java.util.ArrayList; import java.util.List; +import javax.xml.parsers.DocumentBuilder; +import javax.xml.parsers.DocumentBuilderFactory; +import javax.xml.parsers.ParserConfigurationException; +import javax.xml.transform.Result; +import javax.xml.transform.Source; +import javax.xml.transform.TransformerConfigurationException; +import javax.xml.transform.TransformerException; +import javax.xml.transform.TransformerFactory; +import javax.xml.transform.TransformerFactoryConfigurationError; +import javax.xml.transform.dom.DOMSource; +import javax.xml.transform.stream.StreamResult; + /** * Contains general utility functions for handling streams. * @@ -48,7 +67,7 @@ private StreamUtils(){} */ public static String[] asLines(InputStream is) throws IOException { final BufferedReader br = new BufferedReader(new InputStreamReader(is)); - final List lines = new ArrayList(); + final List lines = new ArrayList<>(); try { String line; while ((line = br.readLine()) != null) { @@ -78,7 +97,8 @@ public static String asString(InputStream is, boolean preserveNL) throws IOExcep String line; while ((line = br.readLine()) != null) { content.append(line); - if(preserveNL) content.append('\n'); + if(preserveNL) + content.append('\n'); } return content.toString(); } finally { @@ -93,9 +113,9 @@ public static String asString(InputStream is, boolean preserveNL) throws IOExcep * @return the string content. * @throws IOException if an error occurs while consuming the is stream. 
*/ - public static String asString(InputStream is) throws IOException { - return asString(is, false); - } + public static String asString(InputStream is) throws IOException { + return asString(is, false); + } /** * Closes the closable interface and reports error if any. @@ -112,4 +132,48 @@ public static void closeGracefully(Closeable closable) { } } + /** + * Converts a {@link org.w3c.dom.Document} to an + * {@link java.io.InputStream} + * @throws TransformerFactoryConfigurationError + * @throws TransformerConfigurationException + */ + public static InputStream documentToInputStream(Document doc) + throws TransformerConfigurationException, TransformerFactoryConfigurationError { + ByteArrayOutputStream outputStream = new ByteArrayOutputStream(); + Source xmlSource = new DOMSource(doc); + Result outputTarget = new StreamResult(outputStream); + try { + TransformerFactory.newInstance().newTransformer().transform(xmlSource, outputTarget); + } catch (TransformerException e) { + logger.error("Error during transformation: {}", e); + } + return new ByteArrayInputStream(outputStream.toByteArray()); + } + + public static Document inputStreamToDocument(InputStream is) throws MalformedByteSequenceException { + DocumentBuilderFactory factory = null; + DocumentBuilder builder = null; + Document doc = null; + + try { + factory = DocumentBuilderFactory.newInstance(); + builder = factory.newDocumentBuilder(); + } catch (ParserConfigurationException e) { + logger.error("Error converting InputStream to Document: {}", e); + } + + try { + BOMInputStream bomIn = new BOMInputStream(is, ByteOrderMark.UTF_8, ByteOrderMark.UTF_16BE, + ByteOrderMark.UTF_16LE, ByteOrderMark.UTF_32BE, ByteOrderMark.UTF_32LE); + if (bomIn.hasBOM()) { + @SuppressWarnings("unused") + int firstNonBOMByte = bomIn.read(); // Skips BOM + } + doc = builder.parse(bomIn); + } catch (SAXException | IOException e) { + logger.error("Error converting InputStream to Document: {}", e); + } + return doc; + } } diff --git 
a/core/src/main/resources/org/apache/any23/extractor/openie/example-openie.html b/core/src/main/resources/org/apache/any23/extractor/openie/example-openie.html new file mode 100644 index 000000000..60db78f7f --- /dev/null +++ b/core/src/main/resources/org/apache/any23/extractor/openie/example-openie.html @@ -0,0 +1,631 @@ + + + + + + + + + + + + AQUARIUS | PO.DAAC + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+
+
+ + +
+
+ +
+
+
+
+
+
+ +
+ + + + +
+ + +
+
+
+ +

AQUARIUS

+ + + +
+ + +

Mission Specification & Status

+ +

The Aquarius/SAC-D mission, launched on June 10, 2011,  is a joint venture between NASA and the Argentinean Space Agency (CONAE). The mission features the sea surface salinity sensor Aquarius and is the first mission with the primary goal of measuring sea surface salinity (SSS) from space. Data from Aquarius will play a large role in understanding both climate change and the global water cycle.

+ +

On June 7, 2015 at 12:53:17 UTC the Aquarius/SAC-D observatory suffered a mission-ending hardware failure resulting in the permanent cessation of data flows.  The entire Aquarius data record spans a full 3 year, 9 month period from 8/25/2011 – 6/7/2015.  Version 4.0 of the Aquarius data is the Official NASA end-of-prime mission data for the Aquarius/SAC-D mission.  While no further forward processing of data is possible, a release of a V5.0 end-of-mission dataset is expected in future.
The Aquarius/SAC-D satellite
+This instrument carries 3 radiometers, and 1 scatterometer.  They are operating at 1.4 GHz & 1.2 GHz respectively.  The data collected by the radiometer are being used together with sea surface temperature collected from another platform(s), to derive salinity data.  This is corrected for surface roughness using data from the Aquarius scatterometer.

+ +

The satellite will cross the equator at 6am and pm.  The Aquarius instrument will continuously point away from the sun to avoid glint.

+PO.DAAC will be providing Level 2 SSS data as well as gridded Level 3 degree SSS products generated by the Aquarius Ground Segment at Goddard.  Level 3 products will be produced with temporal resolutions of daily, 8 day, monthly, 3 months, and annual.  Monthly and seasonal climatology products from Aqaurius are also available. The Aquarius instrument will provide global coverage every 7 days. The spatial resolution at Level 2 will be approximately 100km.  L3 products are gridded at 1 degree spatial resolution.

+
+ + +
+ +
+ +
+ +
+
+
News and Announcements

OFFICIAL NASA AQUARIUS/SAC-D VERSION 4.0 END-OF-PRIME-MISSION DATA SET RELEASED

+ +

July 17, 2015

+ +

The PO.DAAC is pleased to announce the availability of the version 4.0 Aquarius/SAC-D data.  This is the official NASA/Aquarius Project end-of-prime-mission dataset spanning the complete 3 year, 9 nine month period of Aquarius science data availability, from August 25, 2011 through June 7, 2015 when an unrecoverable hardware failure caused the end of the mission. This end-of-prime mission dataset does not preclude future reprocessing.  An updated version (V5.0) is planned for release in 2016, and subsequent updates will be released when measurable improvements are achieved.

+ +

Data sets comprising this release include the Level 2 orbital data and Level 3 mapped salinity, wind speed, and derived density products at 1 degree spatial resolution for ascending, descending and combined passes and for the following time intervals: daily, 7 day, monthly, seasonal, annual.  New products (added since V3.0), in addition to Density, include 7-day and 28-day running mean products plus seasonal and monthly climatology datasets. Included as part of v4.0 are also a complementary set of similarly gridded L3 ancillary SST products. A summary of improvements with this new version of the Aquarius data is available here. All users are advised to work with v4.0 over any previous versions.   

+ +

The Aquarius v4.0 data sets are described and discoverable via the PO.DAAC data portal.   Access to these data is via PO.DAAC’s public FTP site:   ftp://podaac-ftp.jpl.nasa.gov/allData/aquarius/ .  The data are also accessible via a range of PO.DAAC tools and services: OPeNDAP, THREDDS, Aquarius Level 3 Browser, LAS, the HiTIDE L2 subsetter and associated Web Services.

+ +

The Aquarius Data Users Guide, Aquarius Data Validation Document, and other primary technical documentation, are available from the FTP site together with reader software.   General information regarding the Aquarius/SAC-D mission is available from the mission website and also via PO.DAAC’s Aquarius  and salinity webpages.

+ +

Should you have any questions, please contact us at: podaac@podaac.jpl.nasa.gov

+ +

 

+ +

NASA's Aquarius Sea Surface Salinity

+ +

+ +

This video provides a global tour of sea surface salinity using measurements taken by NASA's Aquarius instrument aboard the Aquarius/SAC-D spacecraft over the period September 2011 through September 2014. Red represents areas of high salinity, while blue represents areas of low salinity. Aquarius is a focused effort to measure sea surface salinity and provide the global view of salinity variability needed for climate studies. The mission has been a collaboration between NASA and the Space Agency of Argentina (Comisión Nacional de Actividades Espaciales).

+ +

This video is public domain and can be downloaded at: http://svs.gsfc.nasa.gov/cgi-bin/details.cgi?aid=4234

+ +

See all "Official Aquarius/SAC-D" Announcements

+ +

See all PO.DAAC Announcements

+
+
+
+ +
+ +
+
+
+ + + +
+ +
+
+ +
+ +
+ +
+ +
+ +
+ +
+

PO.DAAC Mailing List

+ + + + + +

Upon successful submission, you will receive a confirmation e-mail in your inbox.

+ +
+ + +
+
+ +Image CAPTCHA
+ + +
Enter the characters shown in the image.
+
+
+ +
+ +
+ +
+ Clearance Number: CL05-0770 +
+
+ + + + + + + + + + diff --git a/core/src/test/java/org/apache/any23/extractor/openie/OpenIEExtractorTest.java b/core/src/test/java/org/apache/any23/extractor/openie/OpenIEExtractorTest.java new file mode 100644 index 000000000..7fc725ec2 --- /dev/null +++ b/core/src/test/java/org/apache/any23/extractor/openie/OpenIEExtractorTest.java @@ -0,0 +1,88 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.any23.extractor.openie; + +import static org.junit.Assert.*; + +import java.io.ByteArrayOutputStream; +import java.io.IOException; + +import org.apache.any23.extractor.ExtractionContext; +import org.apache.any23.extractor.ExtractionException; +import org.apache.any23.extractor.ExtractionParameters; +import org.apache.any23.extractor.ExtractionResult; +import org.apache.any23.extractor.ExtractionResultImpl; +import org.apache.any23.rdf.RDFUtils; +import org.apache.any23.util.StreamUtils; +import org.apache.any23.writer.RDFXMLWriter; +import org.apache.any23.writer.TripleHandler; +import org.apache.any23.writer.TripleHandlerException; +import org.eclipse.rdf4j.model.IRI; +import org.junit.After; +import org.junit.Before; +import org.junit.Test; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +/** + * @author lewismc + * + */ +public class OpenIEExtractorTest { + + private static final Logger logger = LoggerFactory.getLogger(OpenIEExtractorTest.class); + + private OpenIEExtractor extractor; + + @Before + public void setUp() throws Exception { + extractor = new OpenIEExtractor(); + } + + @After + public void tearDown() throws Exception { + extractor = null; + } + + @Test + public void testExtractFromHTMLDocument() + throws IOException, ExtractionException, TripleHandlerException { + final IRI uri = RDFUtils.iri("http://podaac.jpl.nasa.gov/example-openie.html"); + extract(uri, "/org/apache/any23/extractor/openie/example-openie.html"); + } + + public void extract(IRI uri, String filePath) + throws IOException, ExtractionException, TripleHandlerException { + ByteArrayOutputStream baos = new ByteArrayOutputStream(); + final TripleHandler tHandler = new RDFXMLWriter(baos); + final ExtractionContext extractionContext = new ExtractionContext("rdf-openie", uri); + final ExtractionResult result = new ExtractionResultImpl(extractionContext, extractor, tHandler); + try { + extractor.run( + ExtractionParameters.newDefault(), + 
extractionContext, + StreamUtils.inputStreamToDocument(this.getClass().getResourceAsStream(filePath)), + result + ); + } finally { + logger.debug(baos.toString()); + tHandler.close(); + result.close(); + } + } + +} diff --git a/core/src/test/java/org/apache/any23/extractor/openie/package-info.java b/core/src/test/java/org/apache/any23/extractor/openie/package-info.java new file mode 100644 index 000000000..3447c47d9 --- /dev/null +++ b/core/src/test/java/org/apache/any23/extractor/openie/package-info.java @@ -0,0 +1,17 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.any23.extractor.openie; \ No newline at end of file diff --git a/pom.xml b/pom.xml index 23ab57fc0..f5c3d1cb2 100644 --- a/pom.xml +++ b/pom.xml @@ -455,6 +455,11 @@ rdf4j-rio-jsonld ${rdf4j.version}
+ + edu.washington.cs.knowitall.openie + openie_2.10 + 4.2.1 + diff --git a/test-resources/src/test/resources/org/apache/any23/extractor/openie/example-openie.html b/test-resources/src/test/resources/org/apache/any23/extractor/openie/example-openie.html new file mode 100644 index 000000000..60db78f7f --- /dev/null +++ b/test-resources/src/test/resources/org/apache/any23/extractor/openie/example-openie.html @@ -0,0 +1,631 @@ + + + + + + + + + + + + AQUARIUS | PO.DAAC + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+
+
+ + +
+
+ +
+
+
+
+
+
+ +
+ + + + +
+ + +
+
+
+ +

AQUARIUS

+ + + +
+ + +

Mission Specification & Status

+ +

The Aquarius/SAC-D mission, launched on June 10, 2011,  is a joint venture between NASA and the Argentinean Space Agency (CONAE). The mission features the sea surface salinity sensor Aquarius and is the first mission with the primary goal of measuring sea surface salinity (SSS) from space. Data from Aquarius will play a large role in understanding both climate change and the global water cycle.

+ +

On June 7, 2015 at 12:53:17 UTC the Aquarius/SAC-D observatory suffered a mission-ending hardware failure resulting in the permanent cessation of data flows.  The entire Aquarius data record spans a full 3 year, 9 month period from 8/25/2011 – 6/7/2015.  Version 4.0 of the Aquarius data is the Official NASA end-of-prime mission data for the Aquarius/SAC-D mission.  While no further forward processing of data is possible, a release of a V5.0 end-of-mission dataset is expected in future.
The Aquarius/SAC-D satellite
+This instrument carries 3 radiometers, and 1 scatterometer.  They are operating at 1.4 GHz & 1.2 GHz respectively.  The data collected by the radiometer are being used together with sea surface temperature collected from another platform(s), to derive salinity data.  This is corrected for surface roughness using data from the Aquarius scatterometer.

+ +

The satellite will cross the equator at 6am and 6pm.  The Aquarius instrument will continuously point away from the sun to avoid glint.

+PO.DAAC will be providing Level 2 SSS data as well as gridded Level 3 degree SSS products generated by the Aquarius Ground Segment at Goddard.  Level 3 products will be produced with temporal resolutions of daily, 8 day, monthly, 3 months, and annual.  Monthly and seasonal climatology products from Aquarius are also available. The Aquarius instrument will provide global coverage every 7 days. The spatial resolution at Level 2 will be approximately 100km.  L3 products are gridded at 1 degree spatial resolution.

+
+ + +
+ +
+ +
+ +
+
+
News and Announcements

OFFICIAL NASA AQUARIUS/SAC-D VERSION 4.0 END-OF-PRIME-MISSION DATA SET RELEASED

+ +

July 17, 2015

+ +

The PO.DAAC is pleased to announce the availability of the version 4.0 Aquarius/SAC-D data.  This is the official NASA/Aquarius Project end-of-prime-mission dataset spanning the complete 3 year, 9 month period of Aquarius science data availability, from August 25, 2011 through June 7, 2015 when an unrecoverable hardware failure caused the end of the mission. This end-of-prime mission dataset does not preclude future reprocessing.  An updated version (V5.0) is planned for release in 2016, and subsequent updates will be released when measurable improvements are achieved.

+ +

Data sets comprising this release include the Level 2 orbital data and Level 3 mapped salinity, wind speed, and derived density products at 1 degree spatial resolution for ascending, descending and combined passes and for the following time intervals: daily, 7 day, monthly, seasonal, annual.  New products (added since V3.0), in addition to Density, include 7-day and 28-day running mean products plus seasonal and monthly climatology datasets. Included as part of v4.0 are also a complementary set of similarly gridded L3 ancillary SST products. A summary of improvements with this new version of the Aquarius data is available here. All users are advised to work with v4.0 over any previous versions.   

+ +

The Aquarius v4.0 data sets are described and discoverable via the PO.DAAC data portal.   Access to these data is via PO.DAAC’s public FTP site:   ftp://podaac-ftp.jpl.nasa.gov/allData/aquarius/ .  The data are also accessible via a range of PO.DAAC tools and services: OPeNDAP, THREDDS, Aquarius Level 3 Browser, LAS, the HiTIDE L2 subsetter and associated Web Services.

+ +

The Aquarius Data Users Guide, Aquarius Data Validation Document, and other primary technical documentation, are available from the FTP site together with reader software.   General information regarding the Aquarius/SAC-D mission is available from the mission website and also via PO.DAAC’s Aquarius  and salinity webpages.

+ +

Should you have any questions, please contact us at: podaac@podaac.jpl.nasa.gov

+ +

 

+ +

NASA's Aquarius Sea Surface Salinity

+ +

+ +

This video provides a global tour of sea surface salinity using measurements taken by NASA's Aquarius instrument aboard the Aquarius/SAC-D spacecraft over the period September 2011 through September 2014. Red represents areas of high salinity, while blue represents areas of low salinity. Aquarius is a focused effort to measure sea surface salinity and provide the global view of salinity variability needed for climate studies. The mission has been a collaboration between NASA and the Space Agency of Argentina (Comisión Nacional de Actividades Espaciales).

+ +

This video is public domain and can be downloaded at: http://svs.gsfc.nasa.gov/cgi-bin/details.cgi?aid=4234

+ +

See all "Official Aquarius/SAC-D" Announcements

+ +

See all PO.DAAC Announcements

+
+
+
+ +
+ +
+
+
+ + + +
+ +
+
+ +
+ +
+ +
+ + + + + + + + + + + +