From 9024f12e0fbef206330b6317a537354fafbe9cd7 Mon Sep 17 00:00:00 2001 From: Joseph Naegele Date: Fri, 20 May 2016 21:02:56 +0000 Subject: [PATCH] fix for TIKA-1980 contributed by naegelejd --- .../apache/tika/sax/XHTMLContentHandler.java | 5 +-- .../tika/parser/html/HtmlParserTest.java | 32 +++++++++++++++++++ .../test/resources/test-documents/head.html | 32 +++++++++++++++++++ 3 files changed, 67 insertions(+), 2 deletions(-) create mode 100644 tika-parsers/src/test/resources/test-documents/head.html diff --git a/tika-core/src/main/java/org/apache/tika/sax/XHTMLContentHandler.java b/tika-core/src/main/java/org/apache/tika/sax/XHTMLContentHandler.java index ada3367cdfd..4742339face 100644 --- a/tika-core/src/main/java/org/apache/tika/sax/XHTMLContentHandler.java +++ b/tika-core/src/main/java/org/apache/tika/sax/XHTMLContentHandler.java @@ -53,7 +53,7 @@ public class XHTMLContentHandler extends SafeContentHandler { * The elements that are in the <head> section. */ private static final Set<String> HEAD = - unmodifiableSet("title", "link", "base", "meta"); + unmodifiableSet("title", "link", "base", "meta", "script"); /** * The elements that are automatically emitted by lazyStartHead, so @@ -74,7 +74,8 @@ public class XHTMLContentHandler extends SafeContentHandler { public static final Set<String> ENDLINE = unmodifiableSet( "p", "h1", "h2", "h3", "h4", "h5", "h6", "div", "ul", "ol", "dl", "pre", "hr", "blockquote", "address", "fieldset", "table", "form", - "noscript", "li", "dt", "dd", "noframes", "br", "tr", "select", "option"); + "noscript", "li", "dt", "dd", "noframes", "br", "tr", "select", + "option", "link", "script"); private static final Attributes EMPTY_ATTRIBUTES = new AttributesImpl(); diff --git a/tika-parsers/src/test/java/org/apache/tika/parser/html/HtmlParserTest.java b/tika-parsers/src/test/java/org/apache/tika/parser/html/HtmlParserTest.java index 8599e5a8314..ab6c58c8b7b 100644 --- a/tika-parsers/src/test/java/org/apache/tika/parser/html/HtmlParserTest.java +++ 
b/tika-parsers/src/test/java/org/apache/tika/parser/html/HtmlParserTest.java @@ -39,6 +39,7 @@ import java.nio.file.Path; import java.nio.file.Paths; import java.util.ArrayList; +import java.util.HashMap; import java.util.List; import java.util.Map; import java.util.concurrent.ArrayBlockingQueue; @@ -1168,6 +1169,37 @@ public void startElement( assertEquals(url, links.get(0)); } + @Test + public void testAllHeadElements() throws Exception { + // IdentityHtmlMapper is needed to extract + + + + + + + + + + + + + + + + + + + Apache Tika + + + + + + +