Skip to content
This repository has been archived by the owner on Jul 3, 2023. It is now read-only.

ANY23-304 Add extractor for OpenIE 1st pass #33

Closed
wants to merge 1 commit into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
70 changes: 35 additions & 35 deletions api/src/main/java/org/apache/any23/vocab/YAML.java
Expand Up @@ -24,52 +24,52 @@
*/
public class YAML extends Vocabulary {

/*
* Namespace of YAML vocabulary
*/
public static final String NS = "http://yaml.org/spec/1.2/spec.html#";
/*
* Namespace of YAML vocabulary
*/
public static final String NS = "http://yaml.org/spec/1.2/spec.html#";

public static final String PREFIX = "yaml";
public static final String PREFIX = "yaml";

public static final String ROOT = "Root";
public static final String ROOT = "Root";

public static final String DOCUMENT = "Document";
public static final String DOCUMENT = "Document";

public static final String NODE = "Node";
public static final String NODE = "Node";

public static final String CONTAINS = "contains";
public static final String CONTAINS = "contains";

private static final YAML _instance = new YAML();
private static final YAML _instance = new YAML();

private YAML() {
super(NS);
}
private YAML() {
super(NS);
}

public static YAML getInstance() {
return _instance;
}
public static YAML getInstance() {
return _instance;
}

/**
* <p>The root node. Representation of the YAML file. NB: one file may contain more than one document
* represented by nodes; e.g. </p>
* <p>
* <code>
* %YAML 1.2
* ---
* - data1
* - data2
* ---
* - data3
* </code>
* </p>
* Contains two documents.
*/
public final IRI root = createProperty(NS, ROOT);
/**
* <p>The root node. Representation of the YAML file. NB: one file may contain more than one document
* represented by nodes; e.g. </p>
* <p>
* <code>
* %YAML 1.2
* ---
* - data1
* - data2
* ---
* - data3
* </code>
* </p>
* Contains two documents.
*/
public final IRI root = createProperty(NS, ROOT);

public final IRI document = createProperty(NS, DOCUMENT);
public final IRI document = createProperty(NS, DOCUMENT);

public final IRI node = createProperty(NS, NODE);
public final IRI node = createProperty(NS, NODE);

public final IRI contains = createProperty(NS, CONTAINS);
public final IRI contains = createProperty(NS, CONTAINS);

}
5 changes: 4 additions & 1 deletion core/pom.xml
Expand Up @@ -83,7 +83,10 @@
<artifactId>snakeyaml</artifactId>
<version>1.17</version>
</dependency>

<dependency>
<groupId>edu.washington.cs.knowitall.openie</groupId>
<artifactId>openie_2.10</artifactId>
</dependency>
<!-- BEGIN: Tika -->
<dependency>
<groupId>org.apache.tika</groupId>
Expand Down
Expand Up @@ -51,7 +51,7 @@ public class HTMLMetaExtractor implements Extractor.TagSoupDOMExtractor {

private IRI profile;

private Map<String, IRI> prefixes = new HashMap<String, IRI>();
private Map<String, IRI> prefixes = new HashMap<>();

private String documentLang;

Expand Down Expand Up @@ -82,29 +82,29 @@ public void run(
lang = meta.getLang();
}
if(meta.isPragmaDirective){
if(lang != null) {
out.writeTriple(
if(lang != null) {
out.writeTriple(
documentIRI,
meta.getHttpEquiv(),
SimpleValueFactory.getInstance().createLiteral(meta.getContent(), lang));
} else {
} else {
out.writeTriple(
documentIRI,
meta.getHttpEquiv(),
SimpleValueFactory.getInstance().createLiteral(meta.getContent()));
}
}
}else {
if(lang != null) {
out.writeTriple(
if(lang != null) {
out.writeTriple(
documentIRI,
meta.getName(),
SimpleValueFactory.getInstance().createLiteral(meta.getContent(), lang));
} else {
out.writeTriple(
} else {
out.writeTriple(
documentIRI,
meta.getName(),
SimpleValueFactory.getInstance().createLiteral(meta.getContent()));
}
}
}
}
}
Expand All @@ -117,15 +117,15 @@ public void run(
*/
private String getDocumentLanguage(Document in) {
String lang = DomUtils.find(in, "string(/HTML/@lang)");
if (lang.equals("")) {
if ("".equals(lang)) {
return null;
}
return lang;
}

private IRI extractProfile(Document in) {
String profile = DomUtils.find(in, "string(/HTML/@profile)");
if (profile.equals("")) {
if ("".equals(profile)) {
return null;
}
return SimpleValueFactory.getInstance().createIRI(profile);
Expand All @@ -150,7 +150,7 @@ private void extractLinkDefinedPrefixes(Document in) {

private Set<Meta> extractMetaElement(Document in, String baseProfile) {
List<Node> metaNodes = DomUtils.findAll(in, "/HTML/HEAD/META");
Set<Meta> result = new HashSet<Meta>();
Set<Meta> result = new HashSet<>();
for (Node metaNode : metaNodes) {
NamedNodeMap attributes = metaNode.getAttributes();
Node nameAttribute = attributes.getNamedItem("name");
Expand Down Expand Up @@ -281,12 +281,15 @@ public void setContent(String content) {

@Override
public boolean equals(Object o) {
if (this == o) return true;
if (o == null || getClass() != o.getClass()) return false;
if (this == o)
return true;
if (o == null || getClass() != o.getClass())
return false;

Meta meta = (Meta) o;

if (xpath != null ? !xpath.equals(meta.xpath) : meta.xpath != null) return false;
if (xpath != null ? !xpath.equals(meta.xpath) : meta.xpath != null)
return false;

return true;
}
Expand Down
Expand Up @@ -82,7 +82,7 @@ public ExtractorDescription getDescription() {
* @param ns the list of script nodes.
*/
private void processScriptNodes(IRI documentIRI, ExtractionContext ec, ExtractionResult er, List<Node> ns) {
if(ns.size() > 0 && turtleParser == null) {
if(!ns.isEmpty() && turtleParser == null) {
turtleParser = RDFParserFactory.getInstance().getTurtleParserInstance(true, false, ec, er);
}
for(Node n : ns) {
Expand Down
@@ -0,0 +1,111 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.any23.extractor.openie;

import java.io.IOException;
import java.util.List;

import javax.xml.transform.TransformerConfigurationException;
import javax.xml.transform.TransformerFactoryConfigurationError;

import org.apache.any23.extractor.Extractor;
import org.apache.any23.extractor.ExtractionContext;
import org.apache.any23.extractor.ExtractorDescription;
import org.apache.any23.util.StreamUtils;
import org.eclipse.rdf4j.model.IRI;
import org.apache.any23.extractor.ExtractionException;
import org.apache.any23.extractor.ExtractionParameters;
import org.apache.any23.extractor.ExtractionResult;

import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.w3c.dom.Document;

import edu.knowitall.openie.Argument;
import edu.knowitall.openie.Instance;
import edu.knowitall.openie.OpenIE;
import edu.knowitall.tool.parse.ClearParser;
import edu.knowitall.tool.postag.ClearPostagger;
import edu.knowitall.tool.srl.ClearSrl;
import edu.knowitall.tool.tokenize.ClearTokenizer;
import scala.collection.JavaConversions;
import scala.collection.Seq;



/**
* An <a href="https://github.com/allenai/openie-standalone">OpenIE</a>
* extractor able to generate <i>RDF</i> statements from
* sentences representing relations in the text.
*/
public class OpenIEExtractor implements Extractor.TagSoupDOMExtractor {

private final Logger LOG = LoggerFactory.getLogger(getClass());

private IRI documentRoot;

/**
* default constructor
*/
OpenIEExtractor() {
// default constructor
}

/**
* @see org.apache.any23.extractor.Extractor#getDescription()
*/
@Override
public ExtractorDescription getDescription() {
return OpenIEExtractorFactory.getDescriptionInstance();
}

@Override
public void run(ExtractionParameters extractionParameters,
ExtractionContext context, Document in, ExtractionResult out)
throws IOException, ExtractionException {
OpenIE openIE = new OpenIE(new ClearParser(new ClearPostagger(new ClearTokenizer())), new ClearSrl(), false, false);


Seq<Instance> extractions = null;
try {
extractions = openIE.extract(StreamUtils.asString(StreamUtils.documentToInputStream(in)));
} catch (TransformerConfigurationException | TransformerFactoryConfigurationError e) {
LOG.error("Error during extraction: {}", e);
}

List<Instance> listExtractions = JavaConversions.seqAsJavaList(extractions);
for(Instance instance : listExtractions) {
StringBuilder sb = new StringBuilder();

sb.append(instance.confidence())
.append('\t')
.append(instance.extr().context())
.append('\t')
.append(instance.extr().arg1().text())
.append('\t')
.append(instance.extr().rel().text())
.append('\t');

List<Argument> listArg2s = JavaConversions.seqAsJavaList(instance.extr().arg2s());
for(Argument argument : listArg2s) {
sb.append(argument.text()).append("; ");
}
System.out.println(sb.toString());
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Presuming this is stub code right now that will eventually send results to the ExtractorResult.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes, correct.

}
}

}
@@ -0,0 +1,53 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.any23.extractor.openie;

import java.util.Arrays;

import org.apache.any23.extractor.ExtractorDescription;
import org.apache.any23.extractor.ExtractorFactory;
import org.apache.any23.extractor.SimpleExtractorFactory;
import org.apache.any23.rdf.Prefixes;

/**
 * Factory for {@link OpenIEExtractor}: creates extractor instances and
 * exposes a shared {@link ExtractorDescription} for them.
 *
 * @author lewismc
 */
public class OpenIEExtractorFactory extends SimpleExtractorFactory<OpenIEExtractor>
        implements ExtractorFactory<OpenIEExtractor> {

    /** Name passed to the parent factory for this extractor. */
    public static final String NAME = "openie";

    /** Prefixes handed to the parent factory; none are defined for this extractor. */
    public static final Prefixes prefixes = null;

    // MIME type strings handed to the parent factory, both carrying q=0.1.
    private static final String HTML_MIME_TYPE = "text/html;q=0.1";
    private static final String XHTML_MIME_TYPE = "application/xhtml+xml;q=0.1";

    // Presumably the name of an example input resource - confirm against SimpleExtractorFactory.
    private static final String EXAMPLE_RESOURCE = "example-openie.html";

    /** Single shared instance returned by {@link #getDescriptionInstance()}. */
    private static final ExtractorDescription SHARED_DESCRIPTION = new OpenIEExtractorFactory();

    /**
     * Builds the factory by passing the extractor name, prefixes, MIME type
     * strings and example resource name to the parent constructor.
     */
    public OpenIEExtractorFactory() {
        super(NAME,
              prefixes,
              Arrays.asList(HTML_MIME_TYPE, XHTML_MIME_TYPE),
              EXAMPLE_RESOURCE);
    }

    /**
     * @return the shared description instance of this factory.
     */
    public static ExtractorDescription getDescriptionInstance() {
        return SHARED_DESCRIPTION;
    }

    /**
     * @return a new {@link OpenIEExtractor}.
     */
    @Override
    public OpenIEExtractor createExtractor() {
        return new OpenIEExtractor();
    }

}
@@ -0,0 +1,24 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

/**
* This package provides an
* <a href="https://github.com/allenai/openie-standalone">OpenIE</a>
* extractor able to generate <i>RDF</i> statements from
* sentences representing relations in the text.
*/
package org.apache.any23.extractor.openie;