Skip to content

Commit

Permalink
TIKA-4195 -- jsoup parser shouldn't conceal backoff to default encodi…
Browse files Browse the repository at this point in the history
…ng (#1591)

* TIKA-4195 -- jsoup parser conceals backoff to default encoding
  • Loading branch information
tballison committed Feb 12, 2024
1 parent c2acd71 commit 455409b
Show file tree
Hide file tree
Showing 6 changed files with 49 additions and 21 deletions.
Expand Up @@ -22,15 +22,14 @@
import java.io.InputStream;
import java.io.InputStreamReader;
import java.nio.charset.Charset;
import java.util.Collections;
import java.util.List;

import org.xml.sax.InputSource;

import org.apache.tika.config.LoadErrorHandler;
import org.apache.tika.config.ServiceLoader;
import org.apache.tika.exception.TikaException;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.TikaCoreProperties;
import org.apache.tika.mime.MediaType;
import org.apache.tika.utils.CharsetUtils;

Expand Down Expand Up @@ -68,26 +67,27 @@ private AutoDetectReader(InputStream stream, Charset charset) throws IOException
/**
* @param stream stream from which to read -- make sure that it supports mark!
* @param metadata
* @param detectors
* @param detector
* @param handler
* @throws IOException
* @throws TikaException
*/
private AutoDetectReader(InputStream stream, Metadata metadata,
List<EncodingDetector> detectors, LoadErrorHandler handler)
EncodingDetector detector, LoadErrorHandler handler)
throws IOException, TikaException {
this(stream, detect(stream, metadata, detectors, handler));
this(stream, detect(stream, metadata, detector, handler));
}

public AutoDetectReader(InputStream stream, Metadata metadata,
EncodingDetector encodingDetector) throws IOException, TikaException {
this(getBuffered(stream), metadata, Collections.singletonList(encodingDetector),
this(getBuffered(stream), metadata, encodingDetector,
DEFAULT_LOADER.getLoadErrorHandler());
}

public AutoDetectReader(InputStream stream, Metadata metadata, ServiceLoader loader)
throws IOException, TikaException {
this(getBuffered(stream), metadata, loader.loadServiceProviders(EncodingDetector.class),
this(getBuffered(stream), metadata,
new CompositeEncodingDetector(loader.loadServiceProviders(EncodingDetector.class)),
loader.getLoadErrorHandler());
}

Expand All @@ -101,19 +101,17 @@ public AutoDetectReader(InputStream stream) throws IOException, TikaException {
}

private static Charset detect(InputStream input, Metadata metadata,
List<EncodingDetector> detectors, LoadErrorHandler handler)
EncodingDetector detector, LoadErrorHandler handler)
throws IOException, TikaException {
// Ask all given detectors for the character encoding
for (EncodingDetector detector : detectors) {
try {
Charset charset = detector.detect(input, metadata);
if (charset != null) {
return charset;
}
} catch (NoClassDefFoundError e) {
// TIKA-1041: Detector dependencies not present.
handler.handleLoadError(detector.getClass().getName(), e);
try {
Charset charset = detector.detect(input, metadata);
if (charset != null) {
return charset;
}
} catch (NoClassDefFoundError e) {
// TIKA-1041: Detector dependencies not present.
handler.handleLoadError(detector.getClass().getName(), e);
}

// Try determining the encoding based on hints in document metadata
Expand All @@ -122,7 +120,11 @@ private static Charset detect(InputStream input, Metadata metadata,
String charset = type.getParameters().get("charset");
if (charset != null) {
try {
return CharsetUtils.forName(charset);
Charset cs = CharsetUtils.forName(charset);
metadata.set(TikaCoreProperties.DETECTED_ENCODING, cs.name());
metadata.set(TikaCoreProperties.ENCODING_DETECTOR,
"AutoDetectReader-charset-metadata-fallback");
return cs;
} catch (IllegalArgumentException e) {
// ignore
}
Expand Down
Expand Up @@ -26,6 +26,7 @@
import java.util.List;

import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.TikaCoreProperties;

public class CompositeEncodingDetector implements EncodingDetector, Serializable {

Expand Down Expand Up @@ -64,6 +65,12 @@ public Charset detect(InputStream input, Metadata metadata) throws IOException {
for (EncodingDetector detector : getDetectors()) {
Charset detected = detector.detect(input, metadata);
if (detected != null) {
metadata.set(TikaCoreProperties.DETECTED_ENCODING, detected.name());
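// If the matching detector is itself a CompositeEncodingDetector, a leaf detector inside
// it has already recorded its own name in ENCODING_DETECTOR -- do not overwrite that.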
if (! detector.getClass().getSimpleName().equals("CompositeEncodingDetector")) {
metadata.set(TikaCoreProperties.ENCODING_DETECTOR,
detector.getClass().getSimpleName());
}
return detected;
}
}
Expand Down
Expand Up @@ -322,6 +322,22 @@ public interface TikaCoreProperties {
// Whether the file is encrypted; declared as an internal boolean property.
Property IS_ENCRYPTED = Property.internalBoolean(TIKA_META_PREFIX + "encrypted");

/**
* Metadata key for the encoding reported by an EncodingDetector. When a detector detects
* an encoding, the name of the detected charset should be stored in this field.
* <p>
* This is different from {@link Metadata#CONTENT_ENCODING}, which records the encoding a
* parser chose to use for processing a file. If an EncodingDetector returns {@code null},
* a parser may choose to use a default encoding. Keeping the two fields separate makes it
* possible to differentiate between a parser using a default encoding and the output of
* an EncodingDetector.
*/
Property DETECTED_ENCODING = Property.externalText(TIKA_META_PREFIX + "detectedEncoding");


/**
* Metadata key identifying the EncodingDetector whose detected encoding was used in the
* parse. The value should be the simple class name of that detector.
* <p>
* Note: code that picks a charset without a detector may store a descriptive label here
* instead of a class name (for example, a label marking a fallback to the charset
* parameter found in the document's content-type metadata).
*/
Property ENCODING_DETECTOR = Property.externalText(TIKA_META_PREFIX + "encodingDetector");

/**
* General metadata key for the count of non-final versions available within a file. This
* was added initially to support generalizing incremental updates in PDF.
Expand Down
Expand Up @@ -1055,7 +1055,7 @@ public void startElement(String uri, String local, String name,
}

assertEquals(1, (int) tagFrequencies.get("title"));
assertEquals(9, (int) tagFrequencies.get("meta"));
assertEquals(11, (int) tagFrequencies.get("meta"));
assertEquals(12, (int) tagFrequencies.get("link"));
assertEquals(6, (int) tagFrequencies.get("script"));
}
Expand Down
Expand Up @@ -106,6 +106,8 @@ public void testLatinDetectionHeuristics() throws Exception {
parser.parse(new ByteArrayInputStream(windows.getBytes("ISO-8859-15")),
new DefaultHandler(), metadata, new ParseContext());
assertEquals("text/plain; charset=windows-1252", metadata.get(Metadata.CONTENT_TYPE));
assertEquals("UniversalEncodingDetector", metadata.get(TikaCoreProperties.ENCODING_DETECTOR));
assertEquals("windows-1252", metadata.get(TikaCoreProperties.DETECTED_ENCODING));

metadata = new Metadata();
parser.parse(new ByteArrayInputStream(unix.getBytes("ISO-8859-15")), new DefaultHandler(),
Expand Down
Expand Up @@ -148,8 +148,9 @@ public void testCharLimitNoThrowOnWriteLimit() throws Exception {

assertEquals("true", list.get(0).get(TikaCoreProperties.WRITE_LIMIT_REACHED));

assertContains("dissolve the political", list.get(6).get(TikaCoreProperties.TIKA_CONTENT));
assertNotContained("them to the separation",
assertContains("necessary for one people",
list.get(6).get(TikaCoreProperties.TIKA_CONTENT));
assertNotContained("dissolve the political",
list.get(6).get(TikaCoreProperties.TIKA_CONTENT));
}

Expand Down

0 comments on commit 455409b

Please sign in to comment.