Merged
80 changes: 78 additions & 2 deletions docs/modules/ROOT/pages/advanced/charset-detection-design.adoc
Original file line number Diff line number Diff line change
@@ -91,8 +91,23 @@ Each `EncodingResult` carries:

| `DECLARATIVE`
| Explicit charset declaration: BOM, HTML `<meta>` tag, HTTP Content-Type
-header, or metadata hint. Should be respected over statistical inferences
-unless structurally impossible.
+header, or metadata hint.
+
+*Important — declared charsets are NOT trusted by default.* When
+`CharSoupEncodingDetector` is in the chain (the default configuration),
+DECLARATIVE candidates are treated as one input among several and are
+arbitrated by language signal alongside STATISTICAL and STRUCTURAL
+candidates. This is deliberate: real-world declarations are notoriously
+unreliable — sites serve `windows-1252` and declare `ISO-8859-1`, serve
+`UTF-8` and declare `ASCII`, copy-paste templates from other regions
+without updating the meta tag, and so on. Tika's stance is that
+language signal over the actual decoded bytes is more trustworthy than
+a declaration on the wire.
+
+If you want declared charsets to be authoritative (e.g. you trust your
+input pipeline, or you specifically want HTML5-spec-compliant behaviour),
+configure your detector chain *without* `CharSoupEncodingDetector` —
+see <<opting-out-of-arbitration>>.

| `STRUCTURAL`
| Derived from byte-level structure (UTF-8 validity, EBCDIC space distribution).
@@ -271,6 +286,25 @@ chain switches `CompositeEncodingDetector` into collect-all mode. After all
other detectors run, CharSoup receives the full `EncodingDetectorContext` and
arbitrates.

[IMPORTANT]
====
*CharSoup intentionally arbitrates over ALL candidates, including
DECLARATIVE ones.* A `<meta charset>` tag, HTTP `Content-Type` charset
parameter, or other declared charset is treated as one input among many
— not as authoritative. Real-world declarations on the legacy web are
notoriously unreliable (sites declare ASCII while serving UTF-8, declare
ISO-8859-1 while serving windows-1252, copy-paste templates from other
regions and forget to update the meta tag, etc.). CharSoup's stance:
language signal over the actual decoded bytes is more trustworthy than
the wire declaration.

If you want declared charsets to be authoritative — for example because
you trust your input pipeline, or you specifically need HTML5
spec-compliant behaviour — *opt out of CharSoup* (see
<<opting-out-of-arbitration>>). This is a configuration choice, not a
limitation.
====
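
To make the arbitration idea concrete, here is a deliberately crude, hypothetical stand-in for a language-signal score — not CharSoup's actual model — that simply penalizes candidates whose lenient decoding produces U+FFFD replacement characters:

```java
import java.nio.charset.Charset;
import java.nio.charset.StandardCharsets;

// Hypothetical scorer, NOT CharSoup's actual model: lenient decoding maps
// malformed byte sequences to U+FFFD, so the replacement-character rate is
// a crude proxy for "this candidate charset fits the bytes".
public class CrudeCharsetScore {
    static double score(byte[] bytes, Charset candidate) {
        String decoded = new String(bytes, candidate); // malformed -> U+FFFD
        if (decoded.isEmpty()) {
            return 0.0;
        }
        long bad = decoded.chars().filter(c -> c == 0xFFFD).count();
        return 1.0 - (double) bad / decoded.length();
    }

    public static void main(String[] args) {
        byte[] utf8 = "d\u00E9claration".getBytes(StandardCharsets.UTF_8);
        System.out.println(score(utf8, StandardCharsets.UTF_8));   // 1.0
        System.out.println(score(utf8, StandardCharsets.US_ASCII)); // < 1.0
    }
}
```

A real language model over character n-grams would separate the candidates far more sharply, but the shape of the decision is the same: decode with every candidate, score the decoded text, rank.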

Before any charset decoding, CharSoup strips leading BOM bytes from the raw
probe. This ensures every candidate charset decodes the same content bytes,
preventing the BOM itself from skewing language scores.
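
A minimal sketch of that BOM-stripping step (a hypothetical helper, not CharSoup's actual code), covering the UTF-8 and UTF-16 marks:

```java
import java.util.Arrays;

// Hypothetical helper illustrating the step described above: strip a
// leading BOM so every candidate charset decodes identical content bytes.
public class BomStripper {
    static byte[] stripBom(byte[] raw) {
        if (raw.length >= 3 && (raw[0] & 0xFF) == 0xEF
                && (raw[1] & 0xFF) == 0xBB && (raw[2] & 0xFF) == 0xBF) {
            return Arrays.copyOfRange(raw, 3, raw.length); // UTF-8 BOM
        }
        if (raw.length >= 2 && (raw[0] & 0xFF) == 0xFE && (raw[1] & 0xFF) == 0xFF) {
            return Arrays.copyOfRange(raw, 2, raw.length); // UTF-16BE BOM
        }
        if (raw.length >= 2 && (raw[0] & 0xFF) == 0xFF && (raw[1] & 0xFF) == 0xFE) {
            return Arrays.copyOfRange(raw, 2, raw.length); // UTF-16LE BOM
        }
        return raw; // no recognised BOM
    }
}
```
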
@@ -306,6 +340,48 @@ false positives from truly lying BOMs or wrong `<meta charset>` tags.
statistical winner; otherwise it returns the first candidate from the
highest-confidence statistical detector.

[[opting-out-of-arbitration]]
=== Opting out — strict declared-charset honoring

If your application needs declared charsets to be authoritative, omit
`CharSoupEncodingDetector` from the encoding-detector chain. Without
CharSoup, `CompositeEncodingDetector` runs in classic
"first-detector-with-a-result wins" mode. A typical declared-charset-honoring
configuration:

[source,json]
----
{
"encoding-detectors": [
{ "bom-detector": {} },
{ "metadata-charset-detector": {} },
{ "standard-html-encoding-detector": {} },
{ "mojibuster-encoding-detector": {} }
]
}
----

In this chain:

* `BOMDetector` returns DECLARATIVE on a recognised BOM.
* `MetadataCharsetDetector` returns DECLARATIVE from HTTP/MIME headers.
* `StandardHtmlEncodingDetector` returns DECLARATIVE from `<meta charset>` /
`<meta http-equiv>` tags.
* `MojibusterEncodingDetector` runs only when none of the above produced a
declaration, and its STATISTICAL result is final (no language-signal
arbitration to second-guess it).
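
The classic first-wins composition can be sketched as follows (hypothetical types — this is not Tika's actual `CompositeEncodingDetector` API):

```java
import java.nio.charset.Charset;
import java.util.List;
import java.util.function.Function;

// Sketch of "first-detector-with-a-result wins" composition: each
// detector either returns a Charset or null for "no opinion", and the
// first non-null answer ends the chain.
public class FirstWinsChain {
    static Charset detect(byte[] input, List<Function<byte[], Charset>> detectors) {
        for (Function<byte[], Charset> detector : detectors) {
            Charset result = detector.apply(input); // null = "no opinion"
            if (result != null) {
                return result; // first non-null result is final
            }
        }
        return null; // fell through: no detector had an opinion
    }
}
```

Contrast with the collect-all mode described earlier: there, every detector contributes a candidate and arbitration happens at the end instead of short-circuiting.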

This is HTML5-spec-compliant for the declaration cases and matches the
behaviour callers familiar with Tika 2.x and earlier expect. The
trade-off is that a lying declaration propagates unfiltered: a page whose
bytes are Korean MS949 but whose declaration says something else entirely
is decoded as declared, even though Mojibuster's statistical output would
have rescued the misdeclaration.

Conversely, the default chain (with CharSoup) tolerates lying
declarations at the cost of occasionally overriding a correct one when
the language signal is ambiguous. Pick the trade-off that matches your
deployment.

[[thai-gbk-case-study]]
=== Case study: why top-N limiting and the generative model matter

240 changes: 240 additions & 0 deletions docs/modules/ROOT/pages/advanced/charset-detection-eval-20260417.txt

Large diffs are not rendered by default.

@@ -99,9 +99,14 @@ private static Charset detect(TikaInputStream tis, Metadata metadata,
// Ask all given detectors for the character encoding
List<EncodingResult> results = detector.detect(tis, metadata, new ParseContext());
if (!results.isEmpty()) {
-            return results.get(0).getCharset();
+            Charset detected = results.get(0).getCharset();
+            Charset superset = CharsetSupersets.supersetOf(detected);
+            if (superset != null) {
+                metadata.set(TikaCoreProperties.DECODED_CHARSET, superset.name());
+                return superset;
+            }
+            return detected;
}
Charset charset = null;

// Try determining the encoding based on hints in document metadata
MediaType type = MediaType.parse(metadata.get(Metadata.CONTENT_TYPE));
@@ -0,0 +1,89 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.tika.detect;

import java.nio.charset.Charset;
import java.util.Collections;
import java.util.HashMap;
import java.util.Map;

/**
* Maps detected charsets to safer superset charsets for decoding.
*
* <p>When Tika detects a charset that is a strict subset of a broader encoding,
* it is safer to decode with the superset — the superset handles all byte
* sequences the subset can produce, plus the extension characters the subset
* cannot represent. Decoding with only the subset risks mojibake on any
* extension characters present in the document.</p>
*
* <p>Policy: Content-Type and detected-encoding metadata report the <em>detected</em>
* charset. Actual stream decoding uses the superset. The superset used is recorded
* in {@link org.apache.tika.metadata.TikaCoreProperties#DECODED_CHARSET}.</p>
*
* <h3>Superset map</h3>
* <ul>
* <li>EUC-KR → x-windows-949 (MS949 is a strict superset: all EUC-KR byte sequences
* decode identically, extension chars in x-windows-949 would mojibake under EUC-KR)</li>
* <li>Big5 → Big5-HKSCS (HKSCS adds Hong Kong Supplementary Characters)</li>
* <li>GB2312 → GB18030 (GB18030 is a strict superset of both GB2312 and GBK)</li>
* <li>GBK → GB18030 (GB18030 is a strict superset; enables 4-byte extension sequences)</li>
* <li>Shift_JIS → windows-31j (MS932 is a strict superset with NEC/IBM extensions)</li>
* </ul>
*/
public final class CharsetSupersets {

/**
* Maps detected charset canonical names (case-sensitive, as returned by
* {@link Charset#name()}) to their superset charset canonical name.
*/
public static final Map<String, String> SUPERSET_MAP;

static {
Map<String, String> m = new HashMap<>();
m.put("EUC-KR", "x-windows-949");
m.put("Big5", "Big5-HKSCS");
m.put("GB2312", "GB18030");
m.put("GBK", "GB18030");
m.put("Shift_JIS", "windows-31j");
SUPERSET_MAP = Collections.unmodifiableMap(m);
}

private CharsetSupersets() {
}

/**
* Returns the superset charset to use for decoding, or {@code null} if
* {@code detected} has no superset override.
*
* @param detected the charset returned by the encoding detector
* @return superset charset, or {@code null} if none is defined
*/
public static Charset supersetOf(Charset detected) {
if (detected == null) {
return null;
}
String supersetName = SUPERSET_MAP.get(detected.name());
if (supersetName == null) {
return null;
}
try {
return Charset.forName(supersetName);
} catch (IllegalArgumentException e) {
return null;
}
}
}
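
The EUC-KR → x-windows-949 row of the superset map can be sanity-checked with a small standalone demo (assumes the JDK ships the extended `x-windows-949` charset, which standard OpenJDK builds do):

```java
import java.nio.charset.Charset;

// Demonstrates the superset property for one row of the map above:
// bytes 0xC7 0xD1 are "\uD55C" (HAN) in EUC-KR, and x-windows-949
// decodes the identical byte sequence to the identical character.
public class SupersetSanityCheck {
    public static void main(String[] args) {
        byte[] eucKrHan = {(byte) 0xC7, (byte) 0xD1};
        String viaEucKr = new String(eucKrHan, Charset.forName("EUC-KR"));
        String viaMs949 = new String(eucKrHan, Charset.forName("x-windows-949"));
        System.out.println(viaEucKr.equals(viaMs949)); // true
    }
}
```

The reverse does not hold: CP949 extension code points encode to byte sequences that EUC-KR cannot decode, which is exactly why decoding with the superset is the safer default.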
@@ -437,6 +437,18 @@ public interface TikaCoreProperties {
Property ENCODING_DETECTION_TRACE =
Property.externalText(TIKA_META_PREFIX + "encodingDetectionTrace");

/**
* The charset actually used to decode the stream when a superset override was applied.
* When the detected encoding (reported in Content-Type and {@link #DETECTED_ENCODING}) is
* a subset of a safer, broader charset (e.g. EUC-KR is a subset of x-windows-949, or
* GB2312 is a subset of GB18030), Tika decodes using the superset charset to avoid
* mojibake on extension characters. This field records the superset charset name so
* callers know which codec was actually used. Absent when detection and decoding use
* the same charset.
*/
Property DECODED_CHARSET =
Property.externalText(TIKA_META_PREFIX + "decodedCharset");

/**
* General metadata key for the count of non-final versions available within a file. This
* was added initially to support generalizing incremental updates in PDF.
@@ -190,7 +190,7 @@ private Charset arbitrate(TikaInputStream tis,

Map<Charset, String> candidates = new LinkedHashMap<>();
for (Charset candidate : uniqueCharsets) {
-            candidates.put(candidate, stripTags(decode(bytes, candidate)));
+            candidates.put(candidate, HtmlStripper.strip(decode(bytes, candidate)));
}

CharSoupLanguageDetector langDetector = new CharSoupLanguageDetector();
@@ -449,26 +449,6 @@ static String decode(byte[] bytes, Charset charset) {
return cb.toString();
}

-    /**
-     * Simple tag stripping: removes &lt;...&gt; sequences so that
-     * HTML/XML tag names and attributes don't pollute language scoring.
-     */
-    static String stripTags(String text) {
-        StringBuilder sb = new StringBuilder(text.length());
-        boolean inTag = false;
-        for (int i = 0; i < text.length(); i++) {
-            char c = text.charAt(i);
-            if (c == '<') {
-                inTag = true;
-            } else if (c == '>') {
-                inTag = false;
-            } else if (!inTag) {
-                sb.append(c);
-            }
-        }
-        return sb.toString();
-    }

public int getReadLimit() {
return readLimit;
}