Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
19 changes: 10 additions & 9 deletions docs/modules/ROOT/pages/advanced/charset-detection-design.adoc
Original file line number Diff line number Diff line change
Expand Up @@ -60,12 +60,13 @@ every detector runs regardless of what the others returned, and the
(≤ 50 bytes).

| 4
| `StandardHtmlEncodingDetector`
| `HtmlEncodingDetector`
| `tika-encoding-detector-html`
| Scans HTML `<meta charset>` / `<meta http-equiv=Content-Type>` tags.
Returns a DECLARATIVE result. Skips BOM detection by default
(`skipBOM=true`) so that `BOMDetector` owns that signal; set `skipBOM=false`
for standalone use without `BOMDetector`.
| Scans HTML `<meta charset>` / `<meta http-equiv=Content-Type>` tags with a
fast lenient regex matcher. Returns a DECLARATIVE result. Applies a
curated subset of WHATWG label aliases (see <<html-charset-aliases>>).
An alternative, spec-strict implementation — `StandardHtmlEncodingDetector`
— is available opt-in for users who need the full WHATWG prescan algorithm.

| 5
| `CharSoupEncodingDetector`
Expand Down Expand Up @@ -503,10 +504,10 @@ Reads the first 4 bytes and detects:
| `FE FF` | UTF-16-BE
|===

Returns a DECLARATIVE result. `StandardHtmlEncodingDetector` skips BOM
detection by default (`skipBOM=true`) so that `BOMDetector` is the sole source
of BOM evidence. This separation allows `CharSoupEncodingDetector` to
arbitrate when a BOM and a `<meta charset>` tag disagree.
Returns a DECLARATIVE result. The HTML detectors do not handle BOMs on their
own: `BOMDetector` is the sole source of BOM evidence, which lets
`CharSoupEncodingDetector` arbitrate when a BOM and a `<meta charset>` tag
disagree.

== Performance and accuracy

Expand Down
20 changes: 11 additions & 9 deletions docs/modules/ROOT/pages/configuration/encoding-detectors.adoc
Original file line number Diff line number Diff line change
Expand Up @@ -40,9 +40,9 @@ The default chain when `tika-charset-detectors-core` is on the classpath:
|A UTF-8, UTF-16 LE/BE, or UTF-32 LE/BE byte-order mark is present.

|3
|`standard-html-encoding-detector`
|`html-encoding-detector`
|An HTML `<meta charset="…">` or `Content-Type` http-equiv tag is found
(WHATWG spec prescan algorithm).
(fast lenient regex matcher, curated WHATWG label aliases).

|4
|`ml-encoding-detector`
Expand Down Expand Up @@ -100,9 +100,10 @@ referenced by name in JSON configuration.
|`tika-charset-detectors-core`
|Byte-order mark detection (UTF-8/16/32). In the default chain.

|`standard-html-encoding-detector`
|`html-encoding-detector`
|`tika-charset-detectors-core`
|WHATWG-spec HTML charset prescan. In the default chain.
|Fast lenient regex matcher for `<meta charset>` / `http-equiv` tags, with a
curated subset of WHATWG label aliases. In the default chain.

|`ml-encoding-detector`
|`tika-charset-detectors-core`
Expand All @@ -114,10 +115,11 @@ In the default chain.
|State-machine structural prober; wraps the `com.github.albfernandez:juniversalchardet`
fork. Auto-registers when the module jar is on the classpath.

|`html-encoding-detector`
|`standard-html-encoding-detector`
|`tika-charset-detectors-core`
|Older regex-based HTML meta-charset detector. Not in the default chain
(use `standard-html-encoding-detector` instead).
|Spec-strict WHATWG prescan algorithm. Not in the default chain — opt in
explicitly if you need strict WHATWG tokenisation (e.g. ignoring charset
declarations inside comments or other contexts the lenient regex may match).

|`icu4j-encoding-detector`
|`tika-charset-detectors-icu4j`
Expand Down Expand Up @@ -159,7 +161,7 @@ statistical chain:
"encoding-detectors": [
{"http-header-encoding-detector": {}},
{"bom-encoding-detector": {}},
{"standard-html-encoding-detector": {}},
{"html-encoding-detector": {}},
{"ml-encoding-detector": {}}
]
}
Expand All @@ -177,7 +179,7 @@ large `<script>` blocks before the `<meta charset>` declaration.
{"http-header-encoding-detector": {}},
{"bom-encoding-detector": {}},
{
"standard-html-encoding-detector": {
"html-encoding-detector": {
"markLimit": 65536
}
},
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -50,7 +50,7 @@
*
* @since Apache Tika 1.2
*/
@TikaComponent(spi = false)
@TikaComponent(name = "html-encoding-detector")
public class HtmlEncodingDetector implements EncodingDetector {

// TIKA-357 - use bigger buffer for meta tag sniffing (was 4K)
Expand Down Expand Up @@ -191,10 +191,10 @@ private Charset findCharset(String s) {
if (CHARSETS_UNSUPPORTED_BY_IANA.contains(candCharset.toLowerCase(Locale.US))) {
continue;
}
if ("x-user-defined".equalsIgnoreCase(candCharset)) {
candCharset = "windows-1252";
Charset aliased = TikaHtmlCharsetAliases.resolve(candCharset);
if (aliased != null) {
return aliased;
}

if (CharsetUtils.isSupported(candCharset)) {
try {
return CharsetUtils.forName(candCharset);
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,172 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.tika.parser.html;

import java.nio.charset.Charset;
import java.nio.charset.IllegalCharsetNameException;
import java.nio.charset.UnsupportedCharsetException;
import java.util.HashMap;
import java.util.Locale;
import java.util.Map;

/**
 * Curated subset of the WHATWG Encoding Standard label table
 * (https://encoding.spec.whatwg.org/) for use by {@link HtmlEncodingDetector}.
 *
 * <p>The WHATWG table is designed for <em>web browsers</em>, where lenient
 * decoding with fallbacks is preferable to failing or producing mojibake.
 * For a generic content-extraction library the same policy can be
 * data-destructive, so this class intentionally departs from the spec in
 * the following places:
 *
 * <ol>
 *   <li><strong>No replacement charset for ISO-2022-KR / ISO-2022-CN /
 *       HZ-GB-2312.</strong> WHATWG maps these to a dummy "replacement"
 *       decoder that emits {@code U+FFFD}. For Tika that would throw away
 *       legitimate text in those encodings, so the labels are simply left
 *       out of the table and the downstream detector chain handles them.</li>
 *   <li><strong>No entries for ISO-8859-10 / 14 / 16.</strong> These labels
 *       are left out of the table so that the caller's
 *       {@link Charset#forName} fallback can resolve them to a native JVM
 *       charset where one is installed. NOTE(review): earlier wording said
 *       WHATWG collapses these into ISO-8859-1 / ISO-8859-4; confirm that
 *       against the current spec before relying on it.</li>
 *   <li><strong>{@code windows-949} / {@code MS949} / {@code CP949} -&gt;
 *       {@code x-windows-949} (not {@code EUC-KR}).</strong> Unified Hangul
 *       Code is a superset of EUC-KR; resolving these labels to EUC-KR
 *       emits {@code U+FFFD} on extension bytes that MS949 decodes
 *       correctly.</li>
 *   <li><strong>{@code big5-hkscs} -&gt; {@code Big5-HKSCS} (not the JDK's
 *       plain {@code Big5}).</strong> The WHATWG Big5 decoder covers the
 *       Hong Kong extensions, but the JDK's {@code Big5} charset does not,
 *       so the explicit HKSCS label is routed to the JDK charset that
 *       does. Plain {@code Big5} is used only if {@code Big5-HKSCS} is not
 *       installed.</li>
 * </ol>
 *
 * <p>The remaining labels, including browser-friendly aliases such as
 * {@code iso-8859-1} -&gt; {@code windows-1252}, {@code iso-8859-9} -&gt;
 * {@code windows-1254}, {@code tis-620} -&gt; {@code windows-874}, and the
 * naked {@code utf-16} -&gt; {@code UTF-16LE} BOM-absent default, are
 * intended to follow the WHATWG table.
 *
 * <p>Target charsets that the running JVM does not provide are skipped when
 * the table is built, so their labels resolve to {@code null} like any other
 * unknown label.
 */
final class TikaHtmlCharsetAliases {

    /** Lower-case label to charset lookup, built once at class initialisation. */
    private static final Map<String, Charset> CHARSETS_BY_LABEL = buildTable();

    private TikaHtmlCharsetAliases() {
    }

    /**
     * Looks up a charset label in the curated alias table. The label is
     * trimmed and lower-cased (using {@link Locale#US}) before the lookup.
     *
     * @param label a charset label from an HTML {@code <meta charset>} or
     *              {@code Content-Type} attribute; may be {@code null}
     * @return the Java charset this label resolves to, or {@code null} if the
     *         label is {@code null} or not in the curated alias table (callers
     *         should then fall back to {@link Charset#forName} with a
     *         supported-by-IANA check)
     */
    static Charset resolve(String label) {
        if (label == null) {
            return null;
        }
        return CHARSETS_BY_LABEL.get(label.trim().toLowerCase(Locale.US));
    }

    /**
     * Builds the label table. Every label literal must already be lower-case,
     * because {@link #resolve(String)} lower-cases its argument before the
     * lookup.
     *
     * @return a map from lower-case label to resolved charset
     */
    private static Map<String, Charset> buildTable() {
        Map<String, Charset> m = new HashMap<>();
        add(m, charset("Big5"), "big5", "cn-big5", "csbig5", "x-x-big5");
        // The JDK's plain Big5 lacks the HKSCS extension range, so the explicit
        // HKSCS label goes to Big5-HKSCS; plain Big5 is only the fallback when
        // Big5-HKSCS is not installed.
        add(m, charset("Big5-HKSCS", "Big5"), "big5-hkscs");
        add(m, charset("EUC-JP"), "cseucpkdfmtjapanese", "euc-jp", "x-euc-jp");
        add(m, charset("EUC-KR"), "cseuckr", "csksc56011987", "euc-kr", "iso-ir-149", "korean",
                "ks_c_5601-1987", "ks_c_5601-1989", "ksc5601", "ksc_5601");
        // windows-949 / MS949 / CP949 are supersets of EUC-KR; route to x-windows-949
        // to preserve MS949 extension syllables.
        add(m, charset("x-windows-949"), "windows-949", "ms949", "cp949");
        add(m, charset("GBK"), "chinese", "csgb2312", "csiso58gb231280", "gb2312", "gb_2312",
                "gb_2312-80", "gbk", "iso-ir-58", "x-gbk");
        add(m, charset("IBM866"), "866", "cp866", "csibm866", "ibm866");
        add(m, charset("ISO-2022-JP"), "csiso2022jp", "iso-2022-jp");
        add(m, charset("ISO-8859-13"), "iso-8859-13", "iso8859-13", "iso885913");
        add(m, charset("ISO-8859-15"), "csisolatin9", "iso-8859-15", "iso8859-15", "iso885915",
                "iso_8859-15", "l9");
        add(m, charset("ISO-8859-2"), "csisolatin2", "iso-8859-2", "iso-ir-101", "iso8859-2",
                "iso88592", "iso_8859-2", "iso_8859-2:1987", "l2", "latin2");
        add(m, charset("ISO-8859-3"), "csisolatin3", "iso-8859-3", "iso-ir-109", "iso8859-3",
                "iso88593", "iso_8859-3", "iso_8859-3:1988", "l3", "latin3");
        add(m, charset("ISO-8859-4"), "csisolatin4", "iso-8859-4", "iso-ir-110", "iso8859-4",
                "iso88594", "iso_8859-4", "iso_8859-4:1988", "l4", "latin4");
        add(m, charset("ISO-8859-5"), "csisolatincyrillic", "cyrillic", "iso-8859-5",
                "iso-ir-144", "iso8859-5", "iso88595", "iso_8859-5", "iso_8859-5:1988");
        add(m, charset("ISO-8859-6"), "arabic", "asmo-708", "csiso88596e", "csiso88596i",
                "csisolatinarabic", "ecma-114", "iso-8859-6", "iso-8859-6-e", "iso-8859-6-i",
                "iso-ir-127", "iso8859-6", "iso88596", "iso_8859-6", "iso_8859-6:1987");
        add(m, charset("ISO-8859-7"), "csisolatingreek", "ecma-118", "elot_928", "greek",
                "greek8", "iso-8859-7", "iso-ir-126", "iso8859-7", "iso88597", "iso_8859-7",
                "iso_8859-7:1987", "sun_eu_greek");
        // ISO-8859-8 (visual order) and ISO-8859-8-I (logical order):
        // we do not implement directionality remapping, so both resolve to ISO-8859-8
        // where available.
        add(m, charset("ISO-8859-8"), "csiso88598e", "csisolatinhebrew", "hebrew", "iso-8859-8",
                "iso-8859-8-e", "iso-ir-138", "iso8859-8", "iso88598", "iso_8859-8",
                "iso_8859-8:1988", "visual");
        add(m, charset("ISO-8859-8-I", "ISO-8859-8"), "csiso88598i", "iso-8859-8-i", "logical");
        add(m, charset("KOI8-R"), "cskoi8r", "koi", "koi8", "koi8-r", "koi8_r");
        add(m, charset("KOI8-U"), "koi8-ru", "koi8-u");
        add(m, charset("Shift_JIS"), "csshiftjis", "ms932", "ms_kanji", "shift-jis",
                "shift_jis", "sjis", "windows-31j", "x-sjis");
        add(m, charset("UTF-16BE"), "utf-16be");
        // Naked "utf-16" with no BOM defaults to UTF-16LE per WHATWG.
        add(m, charset("UTF-16LE"), "utf-16", "utf-16le");
        add(m, charset("UTF-8"), "unicode-1-1-utf-8", "utf-8", "utf8");
        add(m, charset("gb18030"), "gb18030");
        add(m, charset("windows-1250"), "cp1250", "windows-1250", "x-cp1250");
        add(m, charset("windows-1251"), "cp1251", "windows-1251", "x-cp1251");
        add(m, charset("windows-1252"), "ansi_x3.4-1968", "ascii", "cp1252", "cp819",
                "csisolatin1", "ibm819", "iso-8859-1", "iso-ir-100", "iso8859-1", "iso88591",
                "iso_8859-1", "iso_8859-1:1987", "l1", "latin1", "us-ascii", "windows-1252",
                "x-cp1252");
        add(m, charset("windows-1253"), "cp1253", "windows-1253", "x-cp1253");
        add(m, charset("windows-1254"), "cp1254", "csisolatin5", "iso-8859-9", "iso-ir-148",
                "iso8859-9", "iso88599", "iso_8859-9", "iso_8859-9:1989", "l5", "latin5",
                "windows-1254", "x-cp1254");
        add(m, charset("windows-1255"), "cp1255", "windows-1255", "x-cp1255");
        add(m, charset("windows-1256"), "cp1256", "windows-1256", "x-cp1256");
        add(m, charset("windows-1257"), "cp1257", "windows-1257", "x-cp1257");
        add(m, charset("windows-1258"), "cp1258", "windows-1258", "x-cp1258");
        add(m, charset("windows-874"), "dos-874", "iso-8859-11", "iso8859-11", "iso885911",
                "tis-620", "windows-874");
        add(m, charset("x-MacCyrillic"), "x-mac-cyrillic", "x-mac-ukrainian");
        add(m, charset("x-MacRoman"), "csmacintosh", "mac", "macintosh", "x-mac-roman");
        // x-user-defined is a browser-only passthrough; resolve to windows-1252,
        // which mirrors HtmlEncodingDetector's pre-existing behaviour.
        add(m, charset("windows-1252"), "x-user-defined");
        return m;
    }

    /**
     * Returns the first of the given charset names that the running JVM can
     * provide.
     *
     * @param names candidate charset names, in order of preference
     * @return the first resolvable charset, or {@code null} if none of the
     *         names is legal and supported on this JVM
     */
    private static Charset charset(String... names) {
        for (String name : names) {
            try {
                return Charset.forName(name);
            } catch (IllegalCharsetNameException | UnsupportedCharsetException e) {
                // Deliberately ignored: this name is unavailable here, try the next one.
            }
        }
        return null;
    }

    /**
     * Registers every label against the given charset. A later registration
     * of the same label overwrites an earlier one.
     *
     * @param m      the table being built
     * @param cs     the target charset; when {@code null} (not available on
     *               this JVM) nothing is registered
     * @param labels lower-case labels that should resolve to {@code cs}
     */
    private static void add(Map<String, Charset> m, Charset cs, String... labels) {
        if (cs == null) {
            return;
        }
        for (String label : labels) {
            m.put(label, cs);
        }
    }
}
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,6 @@
import java.io.IOException;
import java.io.InputStream;
import java.nio.charset.Charset;
import java.nio.charset.StandardCharsets;
import java.util.AbstractMap;
import java.util.BitSet;
import java.util.Map;
Expand Down Expand Up @@ -50,9 +49,6 @@ class PreScanner {
private static final BitSet SPACE_OR_SLASH = bitSet(WHITESPACE, SLASH);
private static final BitSet SPECIAL_TAGS = bitSet('!', '/', '?');

private static final byte[] UTF8_BOM = {(byte) 0xEF, (byte) 0xBB, (byte) 0xBF};
private static final byte[] UTF16_BE_BOM = {(byte) 0xFE, (byte) 0xFF};
private static final byte[] UTF16_LE_BOM = {(byte) 0xFF, (byte) 0xFE};
private static final byte LOWER_A = (byte) 'a';
private static final byte LOWER_Z = (byte) 'z';
private static final byte UPPER_A = (byte) 'A';
Expand Down Expand Up @@ -97,19 +93,6 @@ Charset scan() {
return null;
}

/**
 * Checks for a byte-order mark by calling {@code expect(...)} with the UTF-8,
 * UTF-16BE and UTF-16LE BOM byte sequences, in that order.
 *
 * @return the charset whose BOM was matched first, or {@code null} when none
 *         matched or the check threw an {@link IOException}
 */
Charset detectBOM() {
    Charset detected = null;
    try {
        if (expect(UTF8_BOM)) {
            detected = StandardCharsets.UTF_8;
        } else if (expect(UTF16_BE_BOM)) {
            detected = StandardCharsets.UTF_16BE;
        } else if (expect(UTF16_LE_BOM)) {
            detected = StandardCharsets.UTF_16LE;
        }
    } catch (IOException ignored) {
        // An unreadable stream is treated the same as "no BOM": detected stays null.
    }
    return detected;
}

private boolean processAtLeastOneByte() {
try {
return processComment() || processMeta() || processTag() || processSpecialTag() ||
Expand Down
Loading
Loading