Skip to content
Permalink
Browse files
ANY23-137 RDFa parser implementation proposal
  • Loading branch information
lewismc committed May 9, 2014
2 parents 7934f79 + 4ce8814 commit c224e2658e6ac7eb1e9a3066dc0a24aeb9e5457f
Show file tree
Hide file tree
Showing 20 changed files with 362 additions and 548 deletions.
@@ -133,6 +133,10 @@
<groupId>org.openrdf.sesame</groupId>
<artifactId>sesame-repository-api</artifactId>
</dependency>
<dependency>
<groupId>org.semarglproject</groupId>
<artifactId>semargl-sesame</artifactId>
</dependency>
<!-- END: Sesame -->

<!-- BEGIN: Misc -->
@@ -105,7 +105,7 @@ public void run(
} catch (RDFHandlerException ex) {
throw new IllegalStateException("Unexpected exception.", ex);
} catch (RDFParseException ex) {
throw new ExtractionException("Error while parsing RDF document.", ex, extractionResult);
// throw new ExtractionException("Error while parsing RDF document.", ex, extractionResult);
}
}

@@ -28,6 +28,8 @@
import org.openrdf.rio.RDFParseException;
import org.openrdf.rio.RDFParser;
import org.openrdf.rio.Rio;
import org.openrdf.rio.helpers.RDFaParserSettings;
import org.openrdf.rio.helpers.RDFaVersion;
import org.openrdf.rio.turtle.TurtleParser;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
@@ -56,7 +58,7 @@ public static RDFParserFactory getInstance() {
}

/**
* Returns a new instance of a configured {@link org.openrdf.rio.turtle.TurtleParser}.
* Returns a new instance of a configured TurtleParser.
*
* @param verifyDataType data verification is enabled if <code>true</code>.
* @param stopAtFirstError the parser stops at first error if <code>true</code>.
@@ -79,7 +81,49 @@ public RDFParser getTurtleParserInstance(
}

/**
* Returns a new instance of a configured {@link org.openrdf.rio.rdfxml.RDFXMLParser}.
* Returns a new instance of a configured RDFaParser, set to RDFa-1.0 compatibility mode.
*
* @param verifyDataType data verification is enabled if <code>true</code>.
* @param stopAtFirstError the parser stops at first error if <code>true</code>.
* @param extractionContext the extraction context where the parser is used.
* @param extractionResult the output extraction result.
* @return a new instance of a configured RDFa parser, set to RDFa 1.0 compatibility mode.
*/
public RDFParser getRDFa10Parser(
        final boolean verifyDataType,
        final boolean stopAtFirstError,
        final ExtractionContext extractionContext,
        final ExtractionResult extractionResult
) {
    // Ask Rio for a parser handling the RDFa format.
    final RDFParser rdfa10Parser = Rio.createParser(RDFFormat.RDFA);

    // Pin the parser's compatibility setting to RDFa 1.0 before it is used.
    rdfa10Parser.getParserConfig().set(
            RDFaParserSettings.RDFA_COMPATIBILITY,
            RDFaVersion.RDFA_1_0
    );

    // Apply the factory-wide configuration shared by the other parser getters
    // (configureParser is defined elsewhere in this class).
    configureParser(
            rdfa10Parser,
            verifyDataType,
            stopAtFirstError,
            extractionContext,
            extractionResult
    );
    return rdfa10Parser;
}

/**
* Returns a new instance of a configured RDFaParser, set to RDFa-1.1 compatibility mode.
*
* @param verifyDataType data verification is enabled if <code>true</code>.
* @param stopAtFirstError the parser stops at first error if <code>true</code>.
* @param extractionContext the extraction context where the parser is used.
* @param extractionResult the output extraction result.
* @return a new instance of a configured RDFa parser, set to RDFa 1.1 compatibility mode.
*/
public RDFParser getRDFa11Parser(
        final boolean verifyDataType,
        final boolean stopAtFirstError,
        final ExtractionContext extractionContext,
        final ExtractionResult extractionResult
) {
    // Ask Rio for a parser handling the RDFa format.
    final RDFParser rdfa11Parser = Rio.createParser(RDFFormat.RDFA);

    // Pin the parser's compatibility setting to RDFa 1.1 before it is used.
    rdfa11Parser.getParserConfig().set(
            RDFaParserSettings.RDFA_COMPATIBILITY,
            RDFaVersion.RDFA_1_1
    );

    // Apply the factory-wide configuration shared by the other parser getters
    // (configureParser is defined elsewhere in this class).
    configureParser(
            rdfa11Parser,
            verifyDataType,
            stopAtFirstError,
            extractionContext,
            extractionResult
    );
    return rdfa11Parser;
}

/**
* Returns a new instance of a configured RDFXMLParser.
*
* @param verifyDataType data verification is enabled if <code>true</code>.
* @param stopAtFirstError the parser stops at first error if <code>true</code>.
@@ -99,7 +143,7 @@ public RDFParser getRDFXMLParser(
}

/**
* Returns a new instance of a configured {@link org.openrdf.rio.ntriples.NTriplesParser}.
* Returns a new instance of a configured NTriplesParser.
*
* @param verifyDataType data verification is enabled if <code>true</code>.
* @param stopAtFirstError the parser stops at first error if <code>true</code>.
@@ -119,7 +163,7 @@ public RDFParser getNTriplesParser(
}

/**
* Returns a new instance of a configured {@link org.apache.any23.io.nquads.NQuadsParser}.
* Returns a new instance of a configured NQuadsParser.
*
* @param verifyDataType data verification is enabled if <code>true</code>.
* @param stopAtFirstError the parser stops at first error if <code>true</code>.
@@ -139,7 +183,7 @@ public RDFParser getNQuadsParser(
}

/**
* Returns a new instance of a configured {@link TriXParser}.
* Returns a new instance of a configured TriXParser.
*
* @param verifyDataType data verification is enabled if <code>true</code>.
* @param stopAtFirstError the parser stops at first error if <code>true</code>.
@@ -18,91 +18,37 @@
package org.apache.any23.extractor.rdfa;

import org.apache.any23.extractor.ExtractionContext;
import org.apache.any23.extractor.ExtractionException;
import org.apache.any23.extractor.ExtractionParameters;
import org.apache.any23.extractor.ExtractionResult;
import org.apache.any23.extractor.Extractor;
import org.apache.any23.extractor.ExtractorDescription;
import org.w3c.dom.Document;

import java.io.IOException;
import java.net.URL;
import org.apache.any23.extractor.rdf.BaseRDFExtractor;
import org.apache.any23.extractor.rdf.RDFParserFactory;
import org.openrdf.rio.RDFParser;

/**
* {@link org.apache.any23.extractor.Extractor} implementation for
* <a href="http://www.w3.org/TR/rdfa-syntax/">RDFa 1.1</a> specification.
* <a href="http://www.w3.org/TR/rdfa-core/">RDFa 1.1</a> specification.
*
* @author Michele Mostarda (mostarda@fbk.eu)
*/
public class RDFa11Extractor implements Extractor.TagSoupDOMExtractor {

private final RDFa11Parser parser;

private boolean verifyDataType;
public class RDFa11Extractor extends BaseRDFExtractor {

private boolean stopAtFirstError;

/**
* Constructor, allows to specify the validation and error handling
* policies.
*
* @param verifyDataType
* if <code>true</code> the data types will be verified, if
* <code>false</code> will be ignored.
* @param stopAtFirstError
* if <code>true</code> the parser will stop at first parsing
* error, if <code>false</code> will ignore non blocking errors.
*/
public RDFa11Extractor(boolean verifyDataType, boolean stopAtFirstError) {
this.parser = new RDFa11Parser();
this.verifyDataType = verifyDataType;
this.stopAtFirstError = stopAtFirstError;
super(verifyDataType, stopAtFirstError);
}

/**
* Default constructor, with no verification of data types and not stop at
* first error.
*/
public RDFa11Extractor() {
this(false, false);
}

public boolean isVerifyDataType() {
return verifyDataType;
}

public void setVerifyDataType(boolean verifyDataType) {
this.verifyDataType = verifyDataType;
}

public boolean isStopAtFirstError() {
return stopAtFirstError;
}

public void setStopAtFirstError(boolean stopAtFirstError) {
this.stopAtFirstError = stopAtFirstError;
}

@Override
public void run(ExtractionParameters extractionParameters,
ExtractionContext extractionContext, Document in,
ExtractionResult out) throws IOException, ExtractionException {
try {
parser.processDocument(new URL(extractionContext.getDocumentURI()
.toString()), in, out);
} catch (RDFa11ParserException rpe) {
throw new ExtractionException("Error while performing extraction.",
rpe);
}
}

/**
* @return the {@link org.apache.any23.extractor.ExtractorDescription} of
* this extractor
*/
@Override
public ExtractorDescription getDescription() {
return RDFa11ExtractorFactory.getDescriptionInstance();
}

@Override
protected RDFParser getParser(ExtractionContext extractionContext, ExtractionResult extractionResult) {
return RDFParserFactory.getInstance().getRDFa11Parser(
isVerifyDataType(), isStopAtFirstError(), extractionContext, extractionResult
);
}
}
@@ -17,147 +17,38 @@

package org.apache.any23.extractor.rdfa;

import org.apache.any23.configuration.DefaultConfiguration;
import org.apache.any23.extractor.ExtractionContext;
import org.apache.any23.extractor.ExtractionException;
import org.apache.any23.extractor.ExtractionParameters;
import org.apache.any23.extractor.ExtractionResult;
import org.apache.any23.extractor.ExtractorDescription;
import org.apache.any23.extractor.rdf.BaseRDFExtractor;
import org.apache.any23.extractor.rdf.RDFParserFactory;
import org.apache.any23.extractor.Extractor.TagSoupDOMExtractor;
import org.openrdf.rio.RDFHandlerException;
import org.openrdf.rio.RDFParseException;
import org.openrdf.rio.RDFParser;
import org.w3c.dom.Document;

import java.io.IOException;
import java.io.InputStream;
import java.io.StringReader;
import java.io.StringWriter;

/**
* Extractor for RDFa in HTML, based on Fabien Gadon's XSLT transform, found
* <a href="http://ns.inria.fr/grddl/rdfa/">here</a>. It works by first
* parsing the HTML using a tagsoup parser, then applies the XSLT to the
* DOM tree, then parses the resulting RDF/XML.
* {@link org.apache.any23.extractor.Extractor} implementation for
* <a href="http://www.w3.org/TR/rdfa-syntax/">RDFa 1.0</a> specification.
*
* @author Gabriele Renzi
* @author Richard Cyganiak (richard@cyganiak.de)
* @author Michele Mostarda (mostarda@fbk.eu)
*/
public class RDFaExtractor implements TagSoupDOMExtractor {

public final static String NAME = "html-rdfa";

public final static String xsltFilename =
DefaultConfiguration.singleton().getPropertyOrFail("any23.rdfa.extractor.xslt");

private static XSLTStylesheet xslt = null;

/**
* Returns a {@link XSLTStylesheet} able to distill RDFa from
* HTML pages.
*
* @return returns a not <code>null</code> XSLT instance.
*/
public static synchronized XSLTStylesheet getXSLT() {
// Lazily initialized static instance, so we don't parse
// the XSLT unless really necessary, and only once
if (xslt == null) {
InputStream in = RDFaExtractor.class.getResourceAsStream(xsltFilename);
if (in == null) {
throw new RuntimeException("Couldn't load '" + xsltFilename +
"', maybe the file is not bundled in the jar?");
}
xslt = new XSLTStylesheet(in);
}
return xslt;
}

private boolean verifyDataType;
public class RDFaExtractor extends BaseRDFExtractor {

private boolean stopAtFirstError;

/**
* Constructor, allows to specify the validation and error handling policies.
*
* @param verifyDataType if <code>true</code> the data types will be verified,
* if <code>false</code> will be ignored.
* @param stopAtFirstError if <code>true</code> the parser will stop at first parsing error,
* if <code>false</code> will ignore non blocking errors.
*/
public RDFaExtractor(boolean verifyDataType, boolean stopAtFirstError) {
this.verifyDataType = verifyDataType;
this.stopAtFirstError = stopAtFirstError;
super(verifyDataType, stopAtFirstError);
}

/**
* Default constructor, with no verification of data types and not stop at first error.
*/
public RDFaExtractor() {
this(false, false);
}

public boolean isVerifyDataType() {
return verifyDataType;
}

public void setVerifyDataType(boolean verifyDataType) {
this.verifyDataType = verifyDataType;
}

public boolean isStopAtFirstError() {
return stopAtFirstError;
}

public void setStopAtFirstError(boolean stopAtFirstError) {
this.stopAtFirstError = stopAtFirstError;
}

@Override
public void run(
ExtractionParameters extractionParameters,
ExtractionContext extractionContext,
Document in,
ExtractionResult out
) throws IOException, ExtractionException {

StringWriter buffer = new StringWriter();
try {
getXSLT().applyTo(in, buffer);
} catch (XSLTStylesheetException xslte) {
throw new ExtractionException("An error occurred during the XSLT application.", xslte);
}

try {
RDFParser parser
= RDFParserFactory.getInstance().getRDFXMLParser(
verifyDataType, stopAtFirstError, extractionContext, out
);
parser.parse(
new StringReader(buffer.getBuffer().toString()),
extractionContext.getDocumentURI().stringValue()
);
} catch (RDFHandlerException ex) {
throw new IllegalStateException(
"Should not happen, RDFHandlerAdapter does not throw RDFHandlerException", ex
);
} catch (RDFParseException ex) {
throw new ExtractionException(
"Invalid RDF/XML produced by RDFa transform.", ex, out
);
}
}

private String getDocType(Document in) {
return in.getDoctype().getPublicId();
}

/**
* @return the {@link org.apache.any23.extractor.ExtractorDescription} of this extractor
*/
@Override
public ExtractorDescription getDescription() {
return RDFaExtractorFactory.getDescriptionInstance();
}

@Override
protected RDFParser getParser(ExtractionContext extractionContext, ExtractionResult extractionResult) {
return RDFParserFactory.getInstance().getRDFa10Parser(
isVerifyDataType(), isStopAtFirstError(), extractionContext, extractionResult
);
}
}
@@ -19,6 +19,7 @@

import org.apache.any23.extractor.ExtractionContext;
import org.apache.any23.extractor.rdfa.RDFaExtractor;
import org.apache.any23.extractor.rdfa.RDFaExtractorFactory;
import org.apache.any23.vocab.XHTML;
import org.apache.any23.writer.TripleHandler;
import org.apache.any23.writer.TripleHandlerException;
@@ -95,7 +96,7 @@ public void close() throws TripleHandlerException {
}

private boolean isRDFaContext(ExtractionContext context) {
return context.getExtractorName().equals(RDFaExtractor.NAME);
return context.getExtractorName().equals(RDFaExtractorFactory.NAME);
}

public void endDocument(URI documentURI) throws TripleHandlerException {

0 comments on commit c224e26

Please sign in to comment.