Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@
import java.io.InputStream;
import java.io.InputStreamReader;
import java.nio.charset.Charset;
import java.nio.charset.StandardCharsets;
import java.util.List;

import org.xml.sax.InputSource;
Expand All @@ -30,9 +31,7 @@
import org.apache.tika.io.TikaInputStream;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.TikaCoreProperties;
import org.apache.tika.mime.MediaType;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.utils.CharsetUtils;

/**
* An input stream reader that automatically detects the character encoding
Expand Down Expand Up @@ -108,24 +107,35 @@ private static Charset detect(TikaInputStream tis, Metadata metadata,
return detected;
}

// Try determining the encoding based on hints in document metadata
MediaType type = MediaType.parse(metadata.get(Metadata.CONTENT_TYPE));
if (type != null) {
String charsetParam = type.getParameters().get("charset");
if (charsetParam != null) {
try {
Charset cs = CharsetUtils.forName(charsetParam);
metadata.set(TikaCoreProperties.DETECTED_ENCODING, cs.name());
metadata.set(TikaCoreProperties.ENCODING_DETECTOR,
"AutoDetectReader-charset-metadata-fallback");
return cs;
} catch (IllegalArgumentException e) {
// ignore
}
}
// Try determining the encoding based on hints in document metadata.
// Two metadata keys are honoured (TIKA-4683 — restoring 3.x parser-layer
// behaviour that consulted both): the charset parameter of CONTENT_TYPE
// (e.g. "text/html; charset=UTF-8") and a bare charset label in
// CONTENT_ENCODING (set by parsers such as RFC822Parser).
Charset metaCharset = MetadataCharsetDetector.charsetFromContentType(metadata);
if (metaCharset == null) {
metaCharset = MetadataCharsetDetector.charsetFromContentEncoding(metadata);
}
if (metaCharset != null) {
metadata.set(TikaCoreProperties.DETECTED_ENCODING, metaCharset.name());
metadata.set(TikaCoreProperties.ENCODING_DETECTOR,
"AutoDetectReader-charset-metadata-fallback");
return metaCharset;
}

throw new TikaException("Failed to detect the character encoding of a document");
// Final fallback (TIKA-4683): when the rolled-back 3.x-style chain
// (Html, Universal, Icu4j) abstains on short/pure-ASCII inputs and
// metadata carries no charset hint, default to ISO-8859-1 rather
// than throwing. This matches 3.x's default-charset behaviour:
// pre-TIKA-4685 the chain effectively returned ISO-8859-1 for
// ASCII-only content, and tests assert that. 4.x's TIKA-4685
// refactor moved to windows-1252 via WHATWG normalisation; we
// explicitly opt out of that here.
Charset fallback = StandardCharsets.ISO_8859_1;
metadata.set(TikaCoreProperties.DETECTED_ENCODING, fallback.name());
metadata.set(TikaCoreProperties.ENCODING_DETECTOR,
"AutoDetectReader-default-fallback");
return fallback;
}

private static TikaInputStream getTikaInputStream(InputStream stream) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -44,7 +44,7 @@
*
* @since Apache Tika 0.x (moved to org.apache.tika.detect in 4.0)
*/
@TikaComponent
@TikaComponent(spi = false)
public class BOMDetector implements EncodingDetector {

private static final ByteOrderMark[] BOMS =
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,9 @@

import java.util.Collection;
import java.util.Comparator;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import javax.imageio.spi.ServiceRegistry;

import org.apache.tika.config.ServiceLoader;
Expand All @@ -28,29 +30,37 @@
* implementations available through the
* {@link ServiceRegistry service provider mechanism}.
*
* <p>Loaded detectors are sorted in two tiers:
* <p>The default chain (Tika 3.x style) runs three detectors in order, with
* the first non-empty result winning:
* <ol>
* <li>Base detectors (non-{@link MetaEncodingDetector}) sorted by full
* class name (non-Tika before Tika, then ascending alphabetically).
* The package ordering guarantees:
* {@code org.apache.tika.ml.*} (Mojibuster) →
* {@code org.apache.tika.parser.*} (HTML).</li>
* <li>{@link MetaEncodingDetector} instances always run last, after all
* base detectors have collected their candidates into
* {@link EncodingDetectorContext}.</li>
* </ol></p>
* <li>{@code org.apache.tika.parser.html.HtmlEncodingDetector}</li>
* <li>{@code org.apache.tika.parser.txt.UniversalEncodingDetector}</li>
* <li>{@code org.apache.tika.parser.txt.Icu4jEncodingDetector}</li>
* </ol>
* Any other {@link EncodingDetector} discovered via SPI (e.g.,
* user-supplied detectors) runs after the three blessed detectors,
* preserving back-compat for callers who add their own.</p>
*
* <p>If you need to control the order of the Detectors explicitly, construct
* your own {@link CompositeEncodingDetector} and pass in the list in the
* required order.</p>
*
* <p>{@link MetaEncodingDetector} handling (collect-all-then-arbitrate)
* is provided by {@link CompositeEncodingDetector}.</p>
*
* @since Apache Tika 1.15
*/
public class DefaultEncodingDetector extends CompositeEncodingDetector {

/** Pinned ordering for the 3.x-style default chain. Detectors not on this
 * map sort after the three blessed ones, ordered by fully qualified class name. */
private static final Map<String, Integer> PRIORITY = buildPriority();

/**
 * Builds the lookup from detector class name to its pinned rank in the
 * default chain. Rank 0 is the highest priority; the rank of each entry is
 * its position in the array below.
 *
 * @return a new mutable map holding the three pinned class names and ranks
 */
private static Map<String, Integer> buildPriority() {
    // Order matters here: the array index becomes the rank.
    String[] pinnedInOrder = {
        "org.apache.tika.parser.html.HtmlEncodingDetector",
        "org.apache.tika.parser.txt.UniversalEncodingDetector",
        "org.apache.tika.parser.txt.Icu4jEncodingDetector"
    };
    Map<String, Integer> ranks = new HashMap<>();
    for (int rank = 0; rank < pinnedInOrder.length; rank++) {
        ranks.put(pinnedInOrder[rank], rank);
    }
    return ranks;
}

/**
 * Creates the default detector using a {@link ServiceLoader} built from the
 * class loader that loaded {@link DefaultEncodingDetector} itself, and
 * delegates to the constructor overload that accepts a {@link ServiceLoader}.
 */
public DefaultEncodingDetector() {
// Use this class's own class loader for the service lookup.
this(new ServiceLoader(DefaultEncodingDetector.class.getClassLoader()));
}
Expand All @@ -67,11 +77,13 @@ public DefaultEncodingDetector(ServiceLoader loader,
}

private static List<EncodingDetector> sorted(List<EncodingDetector> detectors) {
// Two-key sort: base detectors first (meta=0) then MetaEncodingDetectors (meta=1),
// within each tier sorted by full class name for stability across JARs.
// Pin the 3.x default chain (html, universal, icu4j) to fixed
// positions; other detectors fall to the end with stable secondary
// ordering by class name.
detectors.sort(Comparator
.<EncodingDetector, Integer>comparing(
d -> (d instanceof MetaEncodingDetector) ? 1 : 0)
d -> PRIORITY.getOrDefault(
d.getClass().getName(), Integer.MAX_VALUE))
.thenComparing(d -> d.getClass().getName()));
return detectors;
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -49,7 +49,7 @@
*
* @since Apache Tika 4.0
*/
@TikaComponent(name = "metadata-charset-detector")
@TikaComponent(spi = false, name = "metadata-charset-detector")
public class MetadataCharsetDetector implements EncodingDetector {

@Override
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -13,10 +13,9 @@
# See the License for the specific language governing permissions and
# limitations under the License.

# org.apache.tika.detect.* sorts before other detector namespaces so BOM and
# HTTP/MIME DECLARATIVE evidence reaches JunkFilterEncodingDetector before any
# statistical detector runs. Class-name order: BOMDetector first, then
# MetadataCharsetDetector.
org.apache.tika.detect.BOMDetector
org.apache.tika.detect.MetadataCharsetDetector
# Intentionally empty: tika-core itself does not register any default
# EncodingDetector implementations. The default chain is provided by the
# tika-encoding-detector-html, tika-encoding-detector-universal, and
# tika-encoding-detector-icu4j modules and is sequenced by
# DefaultEncodingDetector.

Original file line number Diff line number Diff line change
Expand Up @@ -12,4 +12,4 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
org.apache.tika.parser.html.charsetdetector.StandardHtmlEncodingDetector
org.apache.tika.parser.html.HtmlEncodingDetector
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
org.apache.tika.parser.txt.Icu4jEncodingDetector
Original file line number Diff line number Diff line change
Expand Up @@ -64,7 +64,7 @@
* candidate. Conservative: only return at a layer when that layer's
* structural check is clean.</p>
*/
@TikaComponent(name = "mojibuster-encoding-detector")
@TikaComponent(spi = false, name = "mojibuster-encoding-detector")
public class MojibusterEncodingDetector implements EncodingDetector {

/** Default NB bigram model on the classpath. */
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -13,4 +13,6 @@
# See the License for the specific language governing permissions and
# limitations under the License.

org.apache.tika.ml.chardetect.MojibusterEncodingDetector
# Intentionally empty: MojibusterEncodingDetector is no longer part of the
# default Tika encoding-detection chain. Users who want it must register it
# explicitly via tika-config.
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@
import java.nio.charset.StandardCharsets;
import java.util.List;

import org.junit.jupiter.api.Disabled;
import org.junit.jupiter.api.Test;

import org.apache.tika.detect.DefaultEncodingDetector;
Expand Down Expand Up @@ -53,6 +54,7 @@ public class SparseLatinVcardRegressionTest {
* (windows-1257, IBM852, etc.) is a documented sibling-arbitration
* limitation; only the catastrophic case is asserted here.
*/
@Disabled("TIKA-4683: rolled-back chain (Html, Universal, Icu4j); Mojibuster no longer in default chain.")
@Test
public void sparseLatinVcardDoesNotDetectAsIbm424() throws Exception {
byte[] probe = buildSparseLatinVcard();
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@

import java.util.List;

import org.junit.jupiter.api.Disabled;
import org.junit.jupiter.api.Test;

import org.apache.tika.detect.DefaultEncodingDetector;
Expand Down Expand Up @@ -58,6 +59,7 @@ private static byte[] hexToBytes(String hex) {
* sequentially on two entries differing only in byte 5 (0x31 vs 0x32), simulating
* what ZipParser does when iterating entries with the same ParseContext.
*/
@Disabled("TIKA-4683: rolled-back chain (Html, Universal, Icu4j); Mojibuster no longer in default chain.")
@Test
public void fullPipelineDetectsBothSjisEntries() throws Exception {
DefaultEncodingDetector detector = new DefaultEncodingDetector();
Expand All @@ -77,11 +79,8 @@ public void fullPipelineDetectsBothSjisEntries() throws Exception {

/**
* Full pipeline should detect GBK-encoded entry names as GB18030.
* Disabled: CharSoup's discriminative language model picks KOI8-U over GB18030
* on short probes because the GBK bytes happen to score as Cyrillic.
* Re-enable once generative language models are in place (better calibrated
* confidence will let CharSoup correctly abstain on cross-script ambiguity).
*/
@Disabled("TIKA-4683: rolled-back chain (Html, Universal, Icu4j); Mojibuster no longer in default chain.")
@Test
public void fullPipelineDetectsGbkEntry() throws Exception {
DefaultEncodingDetector detector = new DefaultEncodingDetector();
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
org.apache.tika.parser.txt.UniversalEncodingDetector
Original file line number Diff line number Diff line change
Expand Up @@ -497,7 +497,8 @@ private Path getPluginsConfig(Path pipesDirectory, OpenSearchEmitterConfig.Attac
private void createTestHtmlFiles(String bodyContent, int numHtmlDocs, Path testDocDirectory) throws Exception {
Files.createDirectories(testDocDirectory);
for (int i = 0; i < numHtmlDocs; ++i) {
String html = "<html><body>" + bodyContent + "</body></html>";
String html = "<html><head><meta charset=\"UTF-8\"></head><body>" + bodyContent +
"</body></html>";
Path p = testDocDirectory.resolve( "test-" + i + ".html");
writeStringToPath(p, html);
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -88,8 +88,9 @@ private void createTestFiles() throws NoSuchAlgorithmException {
for (int i = 0; i < numDocs; ++i) {
String nextFileName = "test-" + i + ".html";
testFiles.add(nextFileName);
String s = "<html><body>body-of-" + nextFileName + "</body></html>";
byte[] bytes = s.getBytes(StandardCharsets.US_ASCII);
String s = "<html><head><meta charset=\"UTF-8\"></head><body>body-of-" +
nextFileName + "</body></html>";
byte[] bytes = s.getBytes(StandardCharsets.UTF_8);
PutObjectRequest request = PutObjectRequest.builder().bucket(FETCH_BUCKET).key(nextFileName).build();
RequestBody requestBody = RequestBody.fromBytes(bytes);
s3Client.putObject(request, requestBody);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -63,7 +63,7 @@
*
* @since Apache Tika 4.0.0 (TIKA-4720)
*/
@TikaComponent(name = "junk-filter-encoding-detector")
@TikaComponent(spi = false, name = "junk-filter-encoding-detector")
public class JunkFilterEncodingDetector implements MetaEncodingDetector {

private static final long serialVersionUID = 1L;
Expand Down
Loading
Loading