From d69558268b5d8e8d57f00d94b864c54ec2eaf75f Mon Sep 17 00:00:00 2001 From: Hans Date: Wed, 24 Jan 2018 19:58:25 -0600 Subject: [PATCH] ANY23-291 Allow JSONLD scripts to be located anywhere in document --- .../html/EmbeddedJSONLDExtractor.java | 2 +- .../html/EmbeddedJSONLDExtractorTest.java | 14 ++++++ .../html-body-embedded-jsonld-extractor.html | 37 +++++++++++++++ ...ad-and-body-embedded-jsonld-extractor.html | 47 +++++++++++++++++++ 4 files changed, 99 insertions(+), 1 deletion(-) create mode 100644 test-resources/src/test/resources/html/html-body-embedded-jsonld-extractor.html create mode 100644 test-resources/src/test/resources/html/html-head-and-body-embedded-jsonld-extractor.html diff --git a/core/src/main/java/org/apache/any23/extractor/html/EmbeddedJSONLDExtractor.java b/core/src/main/java/org/apache/any23/extractor/html/EmbeddedJSONLDExtractor.java index 1e6efdfb3..aeffdda7b 100644 --- a/core/src/main/java/org/apache/any23/extractor/html/EmbeddedJSONLDExtractor.java +++ b/core/src/main/java/org/apache/any23/extractor/html/EmbeddedJSONLDExtractor.java @@ -137,7 +137,7 @@ private Set extractJSONLDScript(Document in, String baseProfile, ExtractionParameters extractionParameters, ExtractionContext extractionContext, ExtractionResult out) throws IOException, ExtractionException { - List scriptNodes = DomUtils.findAll(in, "/HTML/HEAD/SCRIPT"); + List scriptNodes = DomUtils.findAll(in, "//SCRIPT"); Set result = new HashSet<>(); extractor = new JSONLDExtractorFactory().createExtractor(); for (Node jsonldNode : scriptNodes) { diff --git a/core/src/test/java/org/apache/any23/extractor/html/EmbeddedJSONLDExtractorTest.java b/core/src/test/java/org/apache/any23/extractor/html/EmbeddedJSONLDExtractorTest.java index 70baa308a..6e7bfa448 100644 --- a/core/src/test/java/org/apache/any23/extractor/html/EmbeddedJSONLDExtractorTest.java +++ b/core/src/test/java/org/apache/any23/extractor/html/EmbeddedJSONLDExtractorTest.java @@ -39,6 +39,20 @@ public void testSeveralEmbeddedJSONLDInHead() throws Exception { assertStatementsSize(null, null, null, 7); } + @Test + public void testEmbeddedJSONLDInBody() throws Exception { + assertExtract("/html/html-body-embedded-jsonld-extractor.html"); + assertModelNotEmpty(); + assertStatementsSize(null, null, null, 3); + } + + @Test + public void testEmbeddedJSONLDInHeadAndBody() throws Exception { + assertExtract("/html/html-head-and-body-embedded-jsonld-extractor.html"); + assertModelNotEmpty(); + assertStatementsSize(null, null, null, 7); + } + @Override protected ExtractorFactory getExtractorFactory() { return new EmbeddedJSONLDExtractorFactory(); diff --git a/test-resources/src/test/resources/html/html-body-embedded-jsonld-extractor.html b/test-resources/src/test/resources/html/html-body-embedded-jsonld-extractor.html new file mode 100644 index 000000000..7efce2d69 --- /dev/null +++ b/test-resources/src/test/resources/html/html-body-embedded-jsonld-extractor.html @@ -0,0 +1,37 @@ + + + + Hello World! + + + +

Embedded JSONLD Extractor

+

It extracts only the embedded JSON-LD elements. +

+ +
+ + \ No newline at end of file diff --git a/test-resources/src/test/resources/html/html-head-and-body-embedded-jsonld-extractor.html b/test-resources/src/test/resources/html/html-head-and-body-embedded-jsonld-extractor.html new file mode 100644 index 000000000..f8ce07149 --- /dev/null +++ b/test-resources/src/test/resources/html/html-head-and-body-embedded-jsonld-extractor.html @@ -0,0 +1,47 @@ + + + + Hello World! + + + + + +

Embedded JSONLD Extractor

+

It extracts only the embedded JSON-LD elements. + + + + \ No newline at end of file