apache · tballison · May 1, 2026 · Apr 30, 2026 · Apr 30, 2026 · Apr 30, 2026
diff --git a/tika-core/src/main/java/org/apache/tika/detect/AutoDetectReader.java b/tika-core/src/main/java/org/apache/tika/detect/AutoDetectReader.java
@@ -21,6 +21,7 @@
 import java.io.InputStream;
 import java.io.InputStreamReader;
 import java.nio.charset.Charset;
+import java.nio.charset.StandardCharsets;
 import java.util.List;
 
 import org.xml.sax.InputSource;
@@ -30,9 +31,7 @@
 import org.apache.tika.io.TikaInputStream;
 import org.apache.tika.metadata.Metadata;
 import org.apache.tika.metadata.TikaCoreProperties;
-import org.apache.tika.mime.MediaType;
 import org.apache.tika.parser.ParseContext;
-import org.apache.tika.utils.CharsetUtils;
 
 /**
  * An input stream reader that automatically detects the character encoding
@@ -108,24 +107,35 @@ private static Charset detect(TikaInputStream tis, Metadata metadata,
             return detected;
         }
 
-        // Try determining the encoding based on hints in document metadata
-        MediaType type = MediaType.parse(metadata.get(Metadata.CONTENT_TYPE));
-        if (type != null) {
-            String charsetParam = type.getParameters().get("charset");
-            if (charsetParam != null) {
-                try {
-                    Charset cs = CharsetUtils.forName(charsetParam);
-                    metadata.set(TikaCoreProperties.DETECTED_ENCODING, cs.name());
-                    metadata.set(TikaCoreProperties.ENCODING_DETECTOR,
-                            "AutoDetectReader-charset-metadata-fallback");
-                    return cs;
-                } catch (IllegalArgumentException e) {
-                    // ignore
-                }
-            }
+        // Try determining the encoding based on hints in document metadata.
+        // Two metadata keys are honoured (TIKA-4683 — restoring 3.x parser-layer
+        // behaviour that consulted both): the charset parameter of CONTENT_TYPE
+        // (e.g. "text/html; charset=UTF-8") and a bare charset label in
+        // CONTENT_ENCODING (set by parsers such as RFC822Parser).
+        Charset metaCharset = MetadataCharsetDetector.charsetFromContentType(metadata);
+        if (metaCharset == null) {
+            metaCharset = MetadataCharsetDetector.charsetFromContentEncoding(metadata);
+        }
+        if (metaCharset != null) {
+            metadata.set(TikaCoreProperties.DETECTED_ENCODING, metaCharset.name());
+            metadata.set(TikaCoreProperties.ENCODING_DETECTOR,
+                    "AutoDetectReader-charset-metadata-fallback");
+            return metaCharset;
         }
 
-        throw new TikaException("Failed to detect the character encoding of a document");
+        // Final fallback (TIKA-4683): when the rolled-back 3.x-style chain
+        // (Html, Universal, Icu4j) abstains on short/pure-ASCII inputs and
+        // metadata carries no charset hint, default to ISO-8859-1 rather
+        // than throwing.  This matches 3.x's default-charset behaviour:
+        // pre-TIKA-4685 the chain effectively returned ISO-8859-1 for
+        // ASCII-only content, and tests assert that.  4.x's TIKA-4685
+        // refactor moved to windows-1252 via WHATWG normalisation; we
+        // explicitly opt out of that here.
+        Charset fallback = StandardCharsets.ISO_8859_1;
+        metadata.set(TikaCoreProperties.DETECTED_ENCODING, fallback.name());
+        metadata.set(TikaCoreProperties.ENCODING_DETECTOR,
+                "AutoDetectReader-default-fallback");
+        return fallback;
     }
 
     private static TikaInputStream getTikaInputStream(InputStream stream) {

diff --git a/tika-core/src/main/java/org/apache/tika/detect/BOMDetector.java b/tika-core/src/main/java/org/apache/tika/detect/BOMDetector.java
@@ -44,7 +44,7 @@
  *
  * @since Apache Tika 0.x (moved to org.apache.tika.detect in 4.0)
  */
-@TikaComponent
+@TikaComponent(spi = false)
 public class BOMDetector implements EncodingDetector {
 
     private static final ByteOrderMark[] BOMS =

diff --git a/tika-core/src/main/java/org/apache/tika/detect/DefaultEncodingDetector.java b/tika-core/src/main/java/org/apache/tika/detect/DefaultEncodingDetector.java
@@ -18,7 +18,9 @@
 
 import java.util.Collection;
 import java.util.Comparator;
+import java.util.HashMap;
 import java.util.List;
+import java.util.Map;
 import javax.imageio.spi.ServiceRegistry;
 
 import org.apache.tika.config.ServiceLoader;
@@ -28,29 +30,37 @@
  * implementations available through the
  * {@link ServiceRegistry service provider mechanism}.
  *
- * <p>Loaded detectors are sorted in two tiers:
+ * <p>The default chain (Tika 3.x style) runs three detectors in order, with
+ * the first non-empty result winning:
  * <ol>
- *   <li>Base detectors (non-{@link MetaEncodingDetector}) sorted by full
- *       class name (non-Tika before Tika, then ascending alphabetically).
- *       The package ordering guarantees:
- *       {@code org.apache.tika.ml.*} (Mojibuster) →
- *       {@code org.apache.tika.parser.*} (HTML).</li>
- *   <li>{@link MetaEncodingDetector} instances always run last, after all
- *       base detectors have collected their candidates into
- *       {@link EncodingDetectorContext}.</li>
- * </ol></p>
+ *   <li>{@code org.apache.tika.parser.html.HtmlEncodingDetector}</li>
+ *   <li>{@code org.apache.tika.parser.txt.UniversalEncodingDetector}</li>
+ *   <li>{@code org.apache.tika.parser.txt.Icu4jEncodingDetector}</li>
+ * </ol>
+ * Any other {@link EncodingDetector} discovered via SPI (e.g.,
+ * user-supplied detectors) runs after the three blessed detectors, ordered
+ * by class name, preserving back-compat for callers who add their own.</p>
  *
  * <p>If you need to control the order of the Detectors explicitly, construct
  * your own {@link CompositeEncodingDetector} and pass in the list in the
  * required order.</p>
  *
- * <p>{@link MetaEncodingDetector} handling (collect-all-then-arbitrate)
- * is provided by {@link CompositeEncodingDetector}.</p>
- *
  * @since Apache Tika 1.15
  */
 public class DefaultEncodingDetector extends CompositeEncodingDetector {
 
+    /** Pinned ordering for the 3.x-style default chain. Detectors not in this
+     *  map sort by class name behind the three blessed ones. */
+    private static final Map<String, Integer> PRIORITY = buildPriority();
+
+    private static Map<String, Integer> buildPriority() {
+        Map<String, Integer> p = new HashMap<>();
+        p.put("org.apache.tika.parser.html.HtmlEncodingDetector", 0);
+        p.put("org.apache.tika.parser.txt.UniversalEncodingDetector", 1);
+        p.put("org.apache.tika.parser.txt.Icu4jEncodingDetector", 2);
+        return p;
+    }
+
     public DefaultEncodingDetector() {
         this(new ServiceLoader(DefaultEncodingDetector.class.getClassLoader()));
     }
@@ -67,11 +77,13 @@ public DefaultEncodingDetector(ServiceLoader loader,
     }
 
     private static List<EncodingDetector> sorted(List<EncodingDetector> detectors) {
-        // Two-key sort: base detectors first (meta=0) then MetaEncodingDetectors (meta=1),
-        // within each tier sorted by full class name for stability across JARs.
+        // Pin the 3.x default chain (html, universal, icu4j) to fixed
+        // positions; other detectors fall to the end with stable secondary
+        // ordering by class name.
         detectors.sort(Comparator
                 .<EncodingDetector, Integer>comparing(
-                        d -> (d instanceof MetaEncodingDetector) ? 1 : 0)
+                        d -> PRIORITY.getOrDefault(
+                                d.getClass().getName(), Integer.MAX_VALUE))
                 .thenComparing(d -> d.getClass().getName()));
         return detectors;
     }

diff --git a/tika-core/src/main/java/org/apache/tika/detect/MetadataCharsetDetector.java b/tika-core/src/main/java/org/apache/tika/detect/MetadataCharsetDetector.java
@@ -49,7 +49,7 @@
  *
  * @since Apache Tika 4.0
  */
-@TikaComponent(name = "metadata-charset-detector")
+@TikaComponent(spi = false, name = "metadata-charset-detector")
 public class MetadataCharsetDetector implements EncodingDetector {
 
     @Override

diff --git a/tika-core/src/main/resources/META-INF/services/org.apache.tika.detect.EncodingDetector b/tika-core/src/main/resources/META-INF/services/org.apache.tika.detect.EncodingDetector
@@ -13,10 +13,9 @@
 #  See the License for the specific language governing permissions and
 #  limitations under the License.
 
-# org.apache.tika.detect.* sorts before other detector namespaces so BOM and
-# HTTP/MIME DECLARATIVE evidence reaches JunkFilterEncodingDetector before any
-# statistical detector runs.  Class-name order: BOMDetector first, then
-# MetadataCharsetDetector.
-org.apache.tika.detect.BOMDetector
-org.apache.tika.detect.MetadataCharsetDetector
+# Intentionally empty: tika-core itself does not register any default
+# EncodingDetector implementations. The default chain is provided by the
+# tika-encoding-detector-html, tika-encoding-detector-universal, and
+# tika-encoding-detector-icu4j modules and is sequenced by
+# DefaultEncodingDetector.
 
diff --git a/...etector-html/src/main/resources/META-INF/services/org.apache.tika.detect.EncodingDetector b/...etector-html/src/main/resources/META-INF/services/org.apache.tika.detect.EncodingDetector
@@ -12,4 +12,4 @@
 #  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 #  See the License for the specific language governing permissions and
 #  limitations under the License.
-org.apache.tika.parser.html.charsetdetector.StandardHtmlEncodingDetector
+org.apache.tika.parser.html.HtmlEncodingDetector
diff --git a/...tector-icu4j/src/main/resources/META-INF/services/org.apache.tika.detect.EncodingDetector b/...tector-icu4j/src/main/resources/META-INF/services/org.apache.tika.detect.EncodingDetector
@@ -0,0 +1,15 @@
+#  Licensed to the Apache Software Foundation (ASF) under one or more
+#  contributor license agreements.  See the NOTICE file distributed with
+#  this work for additional information regarding copyright ownership.
+#  The ASF licenses this file to You under the Apache License, Version 2.0
+#  (the "License"); you may not use this file except in compliance with
+#  the License.  You may obtain a copy of the License at
+#
+#       http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+org.apache.tika.parser.txt.Icu4jEncodingDetector
diff --git a/...or-mojibuster/src/main/java/org/apache/tika/ml/chardetect/MojibusterEncodingDetector.java b/...or-mojibuster/src/main/java/org/apache/tika/ml/chardetect/MojibusterEncodingDetector.java
@@ -64,7 +64,7 @@
  * candidate.  Conservative: only return at a layer when that layer's
  * structural check is clean.</p>
  */
-@TikaComponent(name = "mojibuster-encoding-detector")
+@TikaComponent(spi = false, name = "mojibuster-encoding-detector")
 public class MojibusterEncodingDetector implements EncodingDetector {
 
     /** Default NB bigram model on the classpath. */

diff --git a/...r-mojibuster/src/main/resources/META-INF/services/org.apache.tika.detect.EncodingDetector b/...r-mojibuster/src/main/resources/META-INF/services/org.apache.tika.detect.EncodingDetector
@@ -13,4 +13,6 @@
 #  See the License for the specific language governing permissions and
 #  limitations under the License.
 
-org.apache.tika.ml.chardetect.MojibusterEncodingDetector
+# Intentionally empty: MojibusterEncodingDetector is no longer part of the
+# default Tika encoding-detection chain. Users who want it must register it
+# explicitly via tika-config.
diff --git a/...ojibuster/src/test/java/org/apache/tika/ml/chardetect/SparseLatinVcardRegressionTest.java b/...ojibuster/src/test/java/org/apache/tika/ml/chardetect/SparseLatinVcardRegressionTest.java
@@ -22,6 +22,7 @@
 import java.nio.charset.StandardCharsets;
 import java.util.List;
 
+import org.junit.jupiter.api.Disabled;
 import org.junit.jupiter.api.Test;
 
 import org.apache.tika.detect.DefaultEncodingDetector;
@@ -53,6 +54,7 @@ public class SparseLatinVcardRegressionTest {
      * (windows-1257, IBM852, etc.) is a documented sibling-arbitration
      * limitation; only the catastrophic case is asserted here.
      */
+    @Disabled("TIKA-4683: rolled-back chain (Html, Universal, Icu4j); Mojibuster no longer in default chain.")
     @Test
     public void sparseLatinVcardDoesNotDetectAsIbm424() throws Exception {
         byte[] probe = buildSparseLatinVcard();

diff --git a/...ctor-mojibuster/src/test/java/org/apache/tika/ml/chardetect/ZipFilenameDetectionTest.java b/...ctor-mojibuster/src/test/java/org/apache/tika/ml/chardetect/ZipFilenameDetectionTest.java
@@ -20,6 +20,7 @@
 
 import java.util.List;
 
+import org.junit.jupiter.api.Disabled;
 import org.junit.jupiter.api.Test;
 
 import org.apache.tika.detect.DefaultEncodingDetector;
@@ -58,6 +59,7 @@ private static byte[] hexToBytes(String hex) {
      * sequentially on two entries differing only in byte 5 (0x31 vs 0x32), simulating
      * what ZipParser does when iterating entries with the same ParseContext.
      */
+    @Disabled("TIKA-4683: rolled-back chain (Html, Universal, Icu4j); Mojibuster no longer in default chain.")
     @Test
     public void fullPipelineDetectsBothSjisEntries() throws Exception {
         DefaultEncodingDetector detector = new DefaultEncodingDetector();
@@ -77,11 +79,8 @@ public void fullPipelineDetectsBothSjisEntries() throws Exception {
 
     /**
      * Full pipeline should detect GBK-encoded entry names as GB18030.
-     * Disabled: CharSoup's discriminative language model picks KOI8-U over GB18030
-     * on short probes because the GBK bytes happen to score as Cyrillic.
-     * Re-enable once generative language models are in place (better calibrated
-     * confidence will let CharSoup correctly abstain on cross-script ambiguity).
      */
+    @Disabled("TIKA-4683: rolled-back chain (Html, Universal, Icu4j); Mojibuster no longer in default chain.")
     @Test
     public void fullPipelineDetectsGbkEntry() throws Exception {
         DefaultEncodingDetector detector = new DefaultEncodingDetector();

diff --git a/...or-universal/src/main/resources/META-INF/services/org.apache.tika.detect.EncodingDetector b/...or-universal/src/main/resources/META-INF/services/org.apache.tika.detect.EncodingDetector
@@ -0,0 +1,15 @@
+#  Licensed to the Apache Software Foundation (ASF) under one or more
+#  contributor license agreements.  See the NOTICE file distributed with
+#  this work for additional information regarding copyright ownership.
+#  The ASF licenses this file to You under the Apache License, Version 2.0
+#  (the "License"); you may not use this file except in compliance with
+#  the License.  You may obtain a copy of the License at
+#
+#       http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+org.apache.tika.parser.txt.UniversalEncodingDetector
diff --git a/...ntegration-tests/src/test/java/org/apache/tika/pipes/opensearch/tests/OpenSearchTest.java b/...ntegration-tests/src/test/java/org/apache/tika/pipes/opensearch/tests/OpenSearchTest.java
@@ -497,7 +497,8 @@ private Path getPluginsConfig(Path pipesDirectory, OpenSearchEmitterConfig.Attac
     private void createTestHtmlFiles(String bodyContent, int numHtmlDocs, Path testDocDirectory) throws Exception {
         Files.createDirectories(testDocDirectory);
         for (int i = 0; i < numHtmlDocs; ++i) {
-            String html = "<html><body>" + bodyContent +  "</body></html>";
+            String html = "<html><head><meta charset=\"UTF-8\"></head><body>" + bodyContent +
+                    "</body></html>";
             Path p = testDocDirectory.resolve( "test-" + i + ".html");
             writeStringToPath(p, html);
         }

diff --git a/...integration-tests/src/test/java/org/apache/tika/pipes/s3/tests/S3PipeIntegrationTest.java b/...integration-tests/src/test/java/org/apache/tika/pipes/s3/tests/S3PipeIntegrationTest.java
@@ -88,8 +88,9 @@ private void createTestFiles() throws NoSuchAlgorithmException {
         for (int i = 0; i < numDocs; ++i) {
             String nextFileName = "test-" + i + ".html";
             testFiles.add(nextFileName);
-            String s = "<html><body>body-of-" + nextFileName + "</body></html>";
-            byte[] bytes = s.getBytes(StandardCharsets.US_ASCII);
+            String s = "<html><head><meta charset=\"UTF-8\"></head><body>body-of-" +
+                    nextFileName + "</body></html>";
+            byte[] bytes = s.getBytes(StandardCharsets.UTF_8);
             PutObjectRequest request = PutObjectRequest.builder().bucket(FETCH_BUCKET).key(nextFileName).build();
             RequestBody requestBody = RequestBody.fromBytes(bytes);
             s3Client.putObject(request, requestBody);

diff --git a/...ml-junkdetect/src/main/java/org/apache/tika/ml/junkdetect/JunkFilterEncodingDetector.java b/...ml-junkdetect/src/main/java/org/apache/tika/ml/junkdetect/JunkFilterEncodingDetector.java
@@ -63,7 +63,7 @@
  *
  * @since Apache Tika 4.0.0 (TIKA-4720)
  */
-@TikaComponent(name = "junk-filter-encoding-detector")
+@TikaComponent(spi = false, name = "junk-filter-encoding-detector")
 public class JunkFilterEncodingDetector implements MetaEncodingDetector {
 
     private static final long serialVersionUID = 1L;