// StandardHtmlEncodingDetector.java
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.tika.parser.html.charsetdetector;
import static org.apache.tika.parser.html.charsetdetector.CharsetAliases.getCharsetByLabel;
import java.io.IOException;
import java.io.InputStream;
import java.nio.charset.Charset;
import org.apache.commons.io.input.BoundedInputStream;
import org.apache.tika.config.Field;
import org.apache.tika.detect.EncodingDetector;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.mime.MediaType;
/**
 * An encoding detector that tries to respect the spirit of the HTML spec
 * part 12.2.3 "The input byte stream", or at least the part that is compatible with
 * the implementation of tika.
 * <p>
 * https://html.spec.whatwg.org/multipage/parsing.html#the-input-byte-stream
 * <p>
 * If a resource was fetched over HTTP, then HTTP headers should be added to tika metadata
 * when using {@link #detect}, especially {@link Metadata#CONTENT_TYPE}, as it may contain
 * charset information.
 * <p>
 * This encoding detector may return null if no encoding is detected.
 * It is meant to be used inside a {@link org.apache.tika.detect.CompositeEncodingDetector}.
 * For instance:
 * <pre> {@code
 * EncodingDetector detector = new CompositeEncodingDetector(Arrays.asList(
 *     new StandardHtmlEncodingDetector(),
 *     new Icu4jEncodingDetector()
 * ));
 * }</pre>
 */
public final class StandardHtmlEncodingDetector implements EncodingDetector {

    /** Default number of leading bytes examined when looking for a BOM or a meta tag. */
    private static final int META_TAG_BUFFER_SIZE = 8192;

    /** Configurable read budget; also used as the read-ahead limit passed to mark(). */
    @Field
    private int markLimit = META_TAG_BUFFER_SIZE;

    /**
     * Extracts a charset from a Content-Type HTTP header.
     *
     * @param metadata parser metadata; a {@code null} value is treated as "no header"
     * @return a charset if there is one specified, or null
     */
    private static Charset charsetFromContentType(Metadata metadata) {
        if (metadata == null) {
            return null;
        }
        String contentType = metadata.get(Metadata.CONTENT_TYPE);
        MediaType mediatype = MediaType.parse(contentType);
        if (mediatype == null) {
            return null;
        }
        String charsetLabel = mediatype.getParameters().get("charset");
        return getCharsetByLabel(charsetLabel);
    }

    /**
     * Detects the charset of an HTML document from, in order of priority: a byte
     * order mark, the charset parameter of the Content-Type metadata entry, and
     * the pre-scan of the first bytes of the document.
     * <p>
     * The stream is marked before anything is read and is reset before this
     * method returns, whether it returns normally or throws. At most
     * {@link #getMarkLimit()} bytes are made available to the scanner.
     *
     * @param input    the document stream, which must support mark/reset; when
     *                 {@code null}, only the Content-Type metadata is consulted
     * @param metadata document metadata, read for {@link Metadata#CONTENT_TYPE}
     * @return the detected charset, or {@code null} if none was found
     * @throws IOException if reading from or resetting the stream fails
     */
    @Override
    public Charset detect(InputStream input, Metadata metadata) throws IOException {
        if (input == null) {
            // No bytes to inspect: the transport-level declaration is the only source left.
            return charsetFromContentType(metadata);
        }
        int limit = getMarkLimit();
        input.mark(limit);
        try {
            // Never hand the scanner more than the configured mark limit,
            // so that the reset below stays within the marked region.
            // The wrapper is not closed here: the caller owns the underlying stream.
            InputStream limitedStream = new BoundedInputStream(input, limit);
            PreScanner preScanner = new PreScanner(limitedStream);

            // The order of priority for detection is:
            // 1. Byte Order Mark
            Charset detectedCharset = preScanner.detectBOM();
            // 2. Transport-level information (Content-Type HTTP header)
            if (detectedCharset == null) {
                detectedCharset = charsetFromContentType(metadata);
            }
            // 3. HTML <meta> tag
            if (detectedCharset == null) {
                detectedCharset = preScanner.scan();
            }
            return detectedCharset;
        } finally {
            // Rewind even when scanning failed, so the caller gets the stream back
            // at the position it handed it over.
            input.reset();
        }
    }

    /**
     * Returns how far into the stream detection is allowed to read.
     *
     * @return the current mark limit, in bytes
     */
    public int getMarkLimit() {
        return markLimit;
    }

    /**
     * How far into the stream to read for charset detection.
     * Default is 8192.
     *
     * @param markLimit the new limit, in bytes
     */
    @Field
    public void setMarkLimit(int markLimit) {
        this.markLimit = markLimit;
    }
}