From 455409bf80801152e7c855ddc994fedc32c4cfcf Mon Sep 17 00:00:00 2001 From: Tim Allison Date: Mon, 12 Feb 2024 13:11:47 -0500 Subject: [PATCH] TIKA-4195 -- jsoup parser shouldn't conceal backoff to default encoding (#1591) * TIKA-4195 -- jsoup parser conceals backoff to default encoding --- .../apache/tika/detect/AutoDetectReader.java | 38 ++++++++++--------- .../detect/CompositeEncodingDetector.java | 7 ++++ .../tika/metadata/TikaCoreProperties.java | 16 ++++++++ .../tika/parser/html/HtmlParserTest.java | 2 +- .../apache/tika/parser/txt/TXTParserTest.java | 2 + .../parser/RecursiveParserWrapperTest.java | 5 ++- 6 files changed, 49 insertions(+), 21 deletions(-) diff --git a/tika-core/src/main/java/org/apache/tika/detect/AutoDetectReader.java b/tika-core/src/main/java/org/apache/tika/detect/AutoDetectReader.java index 5cb920aaee..bd7d4f2a95 100644 --- a/tika-core/src/main/java/org/apache/tika/detect/AutoDetectReader.java +++ b/tika-core/src/main/java/org/apache/tika/detect/AutoDetectReader.java @@ -22,8 +22,6 @@ import java.io.InputStream; import java.io.InputStreamReader; import java.nio.charset.Charset; -import java.util.Collections; -import java.util.List; import org.xml.sax.InputSource; @@ -31,6 +29,7 @@ import org.apache.tika.config.ServiceLoader; import org.apache.tika.exception.TikaException; import org.apache.tika.metadata.Metadata; +import org.apache.tika.metadata.TikaCoreProperties; import org.apache.tika.mime.MediaType; import org.apache.tika.utils.CharsetUtils; @@ -68,26 +67,27 @@ private AutoDetectReader(InputStream stream, Charset charset) throws IOException /** * @param stream stream from which to read -- make sure that it supports mark! 
* @param metadata - * @param detectors + * @param detector * @param handler * @throws IOException * @throws TikaException */ private AutoDetectReader(InputStream stream, Metadata metadata, - List detectors, LoadErrorHandler handler) + EncodingDetector detector, LoadErrorHandler handler) throws IOException, TikaException { - this(stream, detect(stream, metadata, detectors, handler)); + this(stream, detect(stream, metadata, detector, handler)); } public AutoDetectReader(InputStream stream, Metadata metadata, EncodingDetector encodingDetector) throws IOException, TikaException { - this(getBuffered(stream), metadata, Collections.singletonList(encodingDetector), + this(getBuffered(stream), metadata, encodingDetector, DEFAULT_LOADER.getLoadErrorHandler()); } public AutoDetectReader(InputStream stream, Metadata metadata, ServiceLoader loader) throws IOException, TikaException { - this(getBuffered(stream), metadata, loader.loadServiceProviders(EncodingDetector.class), + this(getBuffered(stream), metadata, + new CompositeEncodingDetector(loader.loadServiceProviders(EncodingDetector.class)), loader.getLoadErrorHandler()); } @@ -101,19 +101,17 @@ public AutoDetectReader(InputStream stream) throws IOException, TikaException { } private static Charset detect(InputStream input, Metadata metadata, - List detectors, LoadErrorHandler handler) + EncodingDetector detector, LoadErrorHandler handler) throws IOException, TikaException { // Ask all given detectors for the character encoding - for (EncodingDetector detector : detectors) { - try { - Charset charset = detector.detect(input, metadata); - if (charset != null) { - return charset; - } - } catch (NoClassDefFoundError e) { - // TIKA-1041: Detector dependencies not present. - handler.handleLoadError(detector.getClass().getName(), e); + try { + Charset charset = detector.detect(input, metadata); + if (charset != null) { + return charset; } + } catch (NoClassDefFoundError e) { + // TIKA-1041: Detector dependencies not present. 
+ handler.handleLoadError(detector.getClass().getName(), e); } // Try determining the encoding based on hints in document metadata @@ -122,7 +120,11 @@ private static Charset detect(InputStream input, Metadata metadata, String charset = type.getParameters().get("charset"); if (charset != null) { try { - return CharsetUtils.forName(charset); + Charset cs = CharsetUtils.forName(charset); + metadata.set(TikaCoreProperties.DETECTED_ENCODING, cs.name()); + metadata.set(TikaCoreProperties.ENCODING_DETECTOR, + "AutoDetectReader-charset-metadata-fallback"); + return cs; } catch (IllegalArgumentException e) { // ignore } diff --git a/tika-core/src/main/java/org/apache/tika/detect/CompositeEncodingDetector.java b/tika-core/src/main/java/org/apache/tika/detect/CompositeEncodingDetector.java index a50b7e4bc9..7db79ccc7b 100644 --- a/tika-core/src/main/java/org/apache/tika/detect/CompositeEncodingDetector.java +++ b/tika-core/src/main/java/org/apache/tika/detect/CompositeEncodingDetector.java @@ -26,6 +26,7 @@ import java.util.List; import org.apache.tika.metadata.Metadata; +import org.apache.tika.metadata.TikaCoreProperties; public class CompositeEncodingDetector implements EncodingDetector, Serializable { @@ -64,6 +65,12 @@ public Charset detect(InputStream input, Metadata metadata) throws IOException { for (EncodingDetector detector : getDetectors()) { Charset detected = detector.detect(input, metadata); if (detected != null) { + metadata.set(TikaCoreProperties.DETECTED_ENCODING, detected.name()); + //if this has been set by a leaf detector, do not overwrite + if (! 
detector.getClass().getSimpleName().equals("CompositeEncodingDetector")) { + metadata.set(TikaCoreProperties.ENCODING_DETECTOR, + detector.getClass().getSimpleName()); + } return detected; } } diff --git a/tika-core/src/main/java/org/apache/tika/metadata/TikaCoreProperties.java b/tika-core/src/main/java/org/apache/tika/metadata/TikaCoreProperties.java index a75eb8acf7..6ff02c1cf9 100644 --- a/tika-core/src/main/java/org/apache/tika/metadata/TikaCoreProperties.java +++ b/tika-core/src/main/java/org/apache/tika/metadata/TikaCoreProperties.java @@ -322,6 +322,22 @@ public interface TikaCoreProperties { //is the file encrypted Property IS_ENCRYPTED = Property.internalBoolean(TIKA_META_PREFIX + "encrypted"); + /** + * When an EncodingDetector detects an encoding, the encoding should be stored in this field. + * This is different from {@link Metadata#CONTENT_ENCODING} because that is what a parser + * chooses to use for processing a file. If an EncodingDetector returns "null", a parser + * may choose to use a default encoding. We want to differentiate between a parser using a + * default encoding and the output of an EncodingDetector. + */ + Property DETECTED_ENCODING = Property.externalText(TIKA_META_PREFIX + "detectedEncoding"); + + + /** + * This should be the simple class name of the EncodingDetector whose detected encoding + * was used in the parse. + */ + Property ENCODING_DETECTOR = Property.externalText(TIKA_META_PREFIX + "encodingDetector"); + /** * General metadata key for the count of non-final versions available within a file. This * was added initially to support generalizing incremental updates in PDF. 
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-html-module/src/test/java/org/apache/tika/parser/html/HtmlParserTest.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-html-module/src/test/java/org/apache/tika/parser/html/HtmlParserTest.java index 279040bcf0..502911fd33 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-html-module/src/test/java/org/apache/tika/parser/html/HtmlParserTest.java +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-html-module/src/test/java/org/apache/tika/parser/html/HtmlParserTest.java @@ -1055,7 +1055,7 @@ public void startElement(String uri, String local, String name, } assertEquals(1, (int) tagFrequencies.get("title")); - assertEquals(9, (int) tagFrequencies.get("meta")); + assertEquals(11, (int) tagFrequencies.get("meta")); assertEquals(12, (int) tagFrequencies.get("link")); assertEquals(6, (int) tagFrequencies.get("script")); } diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-text-module/src/test/java/org/apache/tika/parser/txt/TXTParserTest.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-text-module/src/test/java/org/apache/tika/parser/txt/TXTParserTest.java index 576d6f6d9c..f39be16c47 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-text-module/src/test/java/org/apache/tika/parser/txt/TXTParserTest.java +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-text-module/src/test/java/org/apache/tika/parser/txt/TXTParserTest.java @@ -106,6 +106,8 @@ public void testLatinDetectionHeuristics() throws Exception { parser.parse(new ByteArrayInputStream(windows.getBytes("ISO-8859-15")), new DefaultHandler(), metadata, new ParseContext()); assertEquals("text/plain; charset=windows-1252", metadata.get(Metadata.CONTENT_TYPE)); + 
assertEquals("UniversalEncodingDetector", metadata.get(TikaCoreProperties.ENCODING_DETECTOR)); + assertEquals("windows-1252", metadata.get(TikaCoreProperties.DETECTED_ENCODING)); metadata = new Metadata(); parser.parse(new ByteArrayInputStream(unix.getBytes("ISO-8859-15")), new DefaultHandler(), diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/RecursiveParserWrapperTest.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/RecursiveParserWrapperTest.java index 61eeab14dc..17b18646a9 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/RecursiveParserWrapperTest.java +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/RecursiveParserWrapperTest.java @@ -148,8 +148,9 @@ public void testCharLimitNoThrowOnWriteLimit() throws Exception { assertEquals("true", list.get(0).get(TikaCoreProperties.WRITE_LIMIT_REACHED)); - assertContains("dissolve the political", list.get(6).get(TikaCoreProperties.TIKA_CONTENT)); - assertNotContained("them to the separation", + assertContains("necessary for one people", + list.get(6).get(TikaCoreProperties.TIKA_CONTENT)); + assertNotContained("dissolve the political", list.get(6).get(TikaCoreProperties.TIKA_CONTENT)); }