Skip to content

HTTPS clone URL

Subversion checkout URL

You can clone with
or
.
Download ZIP
Browse files

TIKA-431: Tika currently misuses the HTTP Content-Encoding header, and does not seem to use the charset part of the Content-Type header properly.

Make text and html parsers return character encoding as a charset parameter in the content type metadata field

git-svn-id: https://svn.apache.org/repos/asf/tika/trunk@1358858 13f79535-47bb-0310-9956-ffa450edef68
  • Loading branch information...
commit bbde395d7f08f047b674198ccb90d88b49e662b2 1 parent ec90610
@jukka jukka authored
Showing with 325 additions and 132 deletions.
  1. +2 −0  .gitattributes
  2. +9 −0 CHANGES.txt
  3. +0 −2  tika-core/src/main/java/org/apache/tika/config/ServiceLoader.java
  4. +11 −23 tika-core/src/main/java/org/apache/tika/detect/TextDetector.java
  5. +133 −0 tika-core/src/main/java/org/apache/tika/detect/TextStatistics.java
  6. +25 −0 tika-core/src/main/java/org/apache/tika/mime/MediaType.java
  7. +1 −1  tika-core/src/main/java/org/apache/tika/mime/package-info.java
  8. +3 −3 tika-core/src/test/java/org/apache/tika/detect/TextDetectorTest.java
  9. +3 −3 tika-core/src/test/java/org/apache/tika/mime/MimeDetectionTest.java
  10. +9 −5 tika-parsers/src/main/java/org/apache/tika/parser/html/HtmlParser.java
  11. +5 −11 tika-parsers/src/main/java/org/apache/tika/parser/microsoft/POIFSContainerDetector.java
  12. +10 −12 tika-parsers/src/main/java/org/apache/tika/parser/txt/TXTParser.java
  13. +22 −11 tika-parsers/src/main/java/org/apache/tika/parser/txt/UniversalEncodingListener.java
  14. +0 −32 tika-parsers/src/test/java/org/apache/tika/mime/MimeTypesTest.java
  15. +26 −0 tika-parsers/src/test/java/org/apache/tika/mime/TestMimeTypes.java
  16. +4 −4 tika-parsers/src/test/java/org/apache/tika/parser/AutoDetectParserTest.java
  17. +1 −1  tika-parsers/src/test/java/org/apache/tika/parser/html/HtmlParserTest.java
  18. +61 −24 tika-parsers/src/test/java/org/apache/tika/parser/txt/TXTParserTest.java
View
2  .gitattributes
@@ -1,2 +1,4 @@
tika-parsers/src/test/resources/test-documents/testARofText.ar eol=lf
tika-parsers/src/test/resources/test-documents/testEMLX.emlx eol=lf
+tika-parsers/src/test/resources/test-documents/testTXT.txt eol=lf
+tika-parsers/src/test/resources/test-documents/testHTML.html eol=lf
View
9 CHANGES.txt
@@ -43,6 +43,15 @@ Release 1.2 - Current Development
ICU4J algorithms are still used as a fallback thanks to their wider
coverage of custom character encodings. (TIKA-322, TIKA-471)
+ * Charset parameter: Related to the character encoding improvements
+ mentioned above, Tika now returns the detected character encoding as
+ a "charset" parameter of the content type metadata field for text/plain
+ and text/html documents. For example, instead of just "text/plain", the
+ returned content type will be something like "text/plain; charset=UTF-8"
+ for a UTF-8 encoded text document. Character encoding information is still
+ present also in the content encoding metadata field for backwards
+ compatibility, but that field should be considered deprecated. (TIKA-431)
+
* Extraction of embedded resources from OLE2 Office Documents, where
the resource isn't another office document, has been fixed (TIKA-948)
View
2  tika-core/src/main/java/org/apache/tika/config/ServiceLoader.java
@@ -26,10 +26,8 @@
import java.util.Collections;
import java.util.Enumeration;
import java.util.HashMap;
-import java.util.HashSet;
import java.util.List;
import java.util.Map;
-import java.util.Set;
import java.util.regex.Pattern;
/**
View
34 tika-core/src/main/java/org/apache/tika/detect/TextDetector.java
@@ -116,30 +116,18 @@ public MediaType detect(InputStream input, Metadata metadata)
input.mark(bytesToTest);
try {
- int chars = 0;
- int controls = 0;
- int asciis = 0;
- int ch = input.read();
- while (ch != -1 && chars < bytesToTest) {
- if (ch < IS_CONTROL_BYTE.length && IS_CONTROL_BYTE[ch]) {
- controls++;
- } else if (ch < 127) {
- asciis++;
- }
- ch = input.read();
- chars++;
+ TextStatistics stats = new TextStatistics();
+
+ byte[] buffer = new byte[1024];
+ int n = 0;
+ int m = input.read(buffer, 0, Math.min(bytesToTest, buffer.length));
+ while (m != -1 && n < bytesToTest) {
+ stats.addData(buffer, 0, m);
+ n += m;
+ m = input.read(buffer, 0, Math.min(bytesToTest - n, buffer.length));
}
- if (chars == 0) {
- // Empty document, so treat it as binary
- // See https://issues.apache.org/jira/browse/TIKA-483
- return MediaType.OCTET_STREAM;
- } else if (controls == 0) {
- // No control characters, so treat it as text
- return MediaType.TEXT_PLAIN;
- } else if (controls < chars * 2 / 100
- && asciis > chars * 90 / 100) {
- // Almost plain text (< 2% control, > 90% ASCII range)
- // See https://issues.apache.org/jira/browse/TIKA-688
+
+ if (stats.isMostlyAscii()) {
return MediaType.TEXT_PLAIN;
} else {
return MediaType.OCTET_STREAM;
View
133 tika-core/src/main/java/org/apache/tika/detect/TextStatistics.java
@@ -0,0 +1,133 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.detect;
+
+/**
+ * Utility class for computing a histogram of the bytes seen in a stream.
+ *
+ * @since Apache Tika 1.2
+ */
/**
 * Utility class for computing a histogram of the bytes seen in a stream.
 *
 * @since Apache Tika 1.2
 */
public class TextStatistics {

    /** Occurrence count per byte value (0-255). */
    private final int[] counts = new int[256];

    /** Total number of bytes seen so far. */
    private int total = 0;

    /**
     * Adds the given bytes to the statistics.
     *
     * @param buffer byte buffer
     * @param offset start offset within the buffer
     * @param length number of bytes to read
     */
    public void addData(byte[] buffer, int offset, int length) {
        for (int i = 0; i < length; i++) {
            counts[buffer[offset + i] & 0xff]++;
            total++;
        }
    }

    /**
     * Checks whether at least one byte was seen and that the bytes that
     * were seen were mostly plain text (i.e. &lt; 2% control, &gt; 90%
     * ASCII range).
     *
     * @see <a href="https://issues.apache.org/jira/browse/TIKA-483">TIKA-483</a>
     * @see <a href="https://issues.apache.org/jira/browse/TIKA-688">TIKA-688</a>
     * @return <code>true</code> if the seen bytes were mostly safe ASCII,
     *         <code>false</code> otherwise
     */
    public boolean isMostlyAscii() {
        int control = count(0, 0x20);
        int ascii = count(0x20, 128);
        int safe = countSafeControl();
        // Integer arithmetic avoids floating point: x * 100 < total * 2
        // is equivalent to x / total < 2%.
        return total > 0
                && (control - safe) * 100 < total * 2
                && (ascii + safe) * 100 > total * 90;
    }

    /**
     * Returns the total number of bytes seen so far.
     *
     * @return count of all bytes
     */
    public int count() {
        return total;
    }

    /**
     * Returns the number of occurrences of the given byte.
     *
     * @param b byte
     * @return count of the given byte
     */
    public int count(int b) {
        return counts[b & 0xff];
    }

    /**
     * Counts control characters (i.e. &lt; 0x20, excluding tab, CR, LF,
     * page feed and escape).
     * <p>
     * This definition of control characters is based on section 4 of the
     * "Content-Type Processing Model" Internet-draft
     * (<a href="http://webblaze.cs.berkeley.edu/2009/mime-sniff/mime-sniff.txt"
     * >draft-abarth-mime-sniff-01</a>).
     * <pre>
     * +-------------------------+
     * | Binary data byte ranges |
     * +-------------------------+
     * | 0x00 -- 0x08            |
     * | 0x0B                    |
     * | 0x0E -- 0x1A            |
     * | 0x1C -- 0x1F            |
     * +-------------------------+
     * </pre>
     *
     * @see <a href="https://issues.apache.org/jira/browse/TIKA-154">TIKA-154</a>
     * @return count of control characters
     */
    public int countControl() {
        return count(0, 0x20) - countSafeControl();
    }

    /**
     * Counts "safe" (i.e. seven-bit non-control) ASCII characters.
     *
     * @see #countControl()
     * @return count of safe ASCII characters
     */
    public int countSafeAscii() {
        return count(0x20, 128) + countSafeControl();
    }

    /**
     * Counts eight bit characters, i.e. bytes with their highest bit set.
     *
     * @return count of eight bit characters
     */
    public int countEightBit() {
        return count(128, 256);
    }

    /**
     * Counts the bytes within the given half-open range [from, to).
     *
     * BUGFIX: the upper bound is exclusive, so <code>to</code> may legally
     * equal <code>counts.length</code> (e.g. <code>count(128, 256)</code>
     * from {@link #countEightBit()}). The original assertion used
     * <code>to &lt; counts.length</code>, which fails with assertions
     * enabled (<code>-ea</code>) for that call.
     *
     * @param from inclusive lower bound
     * @param to exclusive upper bound
     * @return count of bytes in the range
     */
    private int count(int from, int to) {
        assert 0 <= from && from <= to && to <= counts.length;
        int count = 0;
        for (int i = from; i < to; i++) {
            count += counts[i];
        }
        return count;
    }

    /** Counts tab, LF, CR, new page and escape — the "safe" controls. */
    private int countSafeControl() {
        return count('\t') + count('\n') + count('\r') // tab, LF, CR
                + count(0x0c) + count(0x1b);           // new page, escape
    }

}
View
25 tika-core/src/main/java/org/apache/tika/mime/MediaType.java
@@ -17,6 +17,7 @@
package org.apache.tika.mime;
import java.io.Serializable;
+import java.nio.charset.Charset;
import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
@@ -72,6 +73,8 @@
public static final MediaType TEXT_PLAIN = parse("text/plain");
+ public static final MediaType TEXT_HTML = parse("text/html");
+
public static final MediaType APPLICATION_XML = parse("application/xml");
public static final MediaType APPLICATION_ZIP = parse("application/zip");
@@ -346,6 +349,28 @@ public MediaType(MediaType type, Map<String, String> parameters) {
}
/**
+ * Creates a media type by adding a parameter to a base type.
+ *
+ * @param type base type
+ * @param name parameter name
+ * @param value parameter value
+ * @since Apache Tika 1.2
+ */
+ public MediaType(MediaType type, String name, String value) {
+ this(type, Collections.singletonMap(name, value));
+ }
+
+ /**
+ * Creates a media type by adding the "charset" parameter to a base type.
+ *
+ * @param type base type
+ * @param charset charset value
+ * @since Apache Tika 1.2
+ */
+ public MediaType(MediaType type, Charset charset) {
+ this(type, "charset", charset.name());
+ }
+ /**
* Returns the base form of the MediaType, excluding
* any parameters, such as "text/plain" for
* "text/plain; charset=utf-8"
View
2  tika-core/src/main/java/org/apache/tika/mime/package-info.java
@@ -18,5 +18,5 @@
/**
* Media type information.
*/
-@aQute.bnd.annotation.Version("1.0.0")
+@aQute.bnd.annotation.Version("1.2.0")
package org.apache.tika.mime;
View
6 tika-core/src/test/java/org/apache/tika/detect/TextDetectorTest.java
@@ -51,16 +51,16 @@ public void testDetectEmpty() throws Exception {
public void testDetectText() throws Exception {
assertText("Hello, World!".getBytes("UTF-8"));
assertText(" \t\r\n".getBytes("UTF-8"));
- assertText(new byte[] { -1, -2, -3, 0x09, 0x0A, 0x0C, 0x0D, 0x1B });
+ assertNotText(new byte[] { -1, -2, -3, 0x09, 0x0A, 0x0C, 0x0D, 0x1B });
assertNotText(new byte[] { 0 });
assertNotText(new byte[] { 'H', 'e', 'l', 'l', 'o', 0 });
byte[] data = new byte[512];
Arrays.fill(data, (byte) '.');
assertText(data);
- Arrays.fill(data, 100, 109, (byte) 0x1f);
- assertText(data); // almost text
Arrays.fill(data, 100, 110, (byte) 0x1f);
+ assertText(data); // almost text
+ Arrays.fill(data, 100, 111, (byte) 0x1f);
assertNotText(data); // no longer almost text, too many control chars
Arrays.fill(data, (byte) 0x1f);
assertNotText(data);
View
6 tika-core/src/test/java/org/apache/tika/mime/MimeDetectionTest.java
@@ -67,13 +67,13 @@ public void testDetection() throws Exception {
public void testByteOrderMark() throws Exception {
assertEquals(MediaType.TEXT_PLAIN, mimeTypes.detect(
- new ByteArrayInputStream("\ufffetest".getBytes("UTF-16LE")),
+ new ByteArrayInputStream("\ufefftest".getBytes("UTF-16LE")),
new Metadata()));
assertEquals(MediaType.TEXT_PLAIN, mimeTypes.detect(
- new ByteArrayInputStream("\ufffetest".getBytes("UTF-16BE")),
+ new ByteArrayInputStream("\ufefftest".getBytes("UTF-16BE")),
new Metadata()));
assertEquals(MediaType.TEXT_PLAIN, mimeTypes.detect(
- new ByteArrayInputStream("\ufffetest".getBytes("UTF-8")),
+ new ByteArrayInputStream("\ufefftest".getBytes("UTF-8")),
new Metadata()));
}
View
14 tika-parsers/src/main/java/org/apache/tika/parser/html/HtmlParser.java
@@ -18,6 +18,7 @@
import java.io.IOException;
import java.io.InputStream;
+import java.nio.charset.Charset;
import java.util.Arrays;
import java.util.Collections;
import java.util.HashSet;
@@ -57,7 +58,7 @@
new ServiceLoader(HtmlParser.class.getClassLoader());
/**
- * HTML schema singleton used to amortize the heavy instantiation time.
+ * HTML schema singleton used to amortise the heavy instantiation time.
*/
private static final Schema HTML_SCHEMA = new HTMLSchema();
@@ -73,11 +74,14 @@ public void parse(
AutoDetectReader reader = new AutoDetectReader(
new CloseShieldInputStream(stream), metadata, LOADER);
try {
- if (metadata.get(Metadata.CONTENT_TYPE) == null) {
- // TODO: Include charset
- metadata.set(Metadata.CONTENT_TYPE, "text/html");
+ Charset charset = reader.getCharset();
+ String previous = metadata.get(Metadata.CONTENT_TYPE);
+ if (previous == null || previous.startsWith("text/html")) {
+ MediaType type = new MediaType(MediaType.TEXT_HTML, charset);
+ metadata.set(Metadata.CONTENT_TYPE, type.toString());
}
- metadata.set(Metadata.CONTENT_ENCODING, reader.getCharset().name());
+ // deprecated, see TIKA-431
+ metadata.set(Metadata.CONTENT_ENCODING, charset.name());
// Get the HTML mapper from the parse context
HtmlMapper mapper =
View
16 tika-parsers/src/main/java/org/apache/tika/parser/microsoft/POIFSContainerDetector.java
@@ -22,9 +22,7 @@
import java.io.InputStream;
import java.nio.channels.FileChannel;
import java.util.Collections;
-import java.util.HashMap;
import java.util.HashSet;
-import java.util.Map;
import java.util.Set;
import java.util.regex.Pattern;
@@ -76,10 +74,12 @@
public static final MediaType GENERAL_EMBEDDED = application("x-tika-msoffice-embedded");
/** An OLE10 Native embedded document within another OLE2 document */
- public static final MediaType OLE10_NATIVE = new MediaType(GENERAL_EMBEDDED, format("ole10_native"));
+ public static final MediaType OLE10_NATIVE =
+ new MediaType(GENERAL_EMBEDDED, "format", "ole10_native");
/** Some other kind of embedded document, in a CompObj container within another OLE2 document */
- public static final MediaType COMP_OBJ = new MediaType(GENERAL_EMBEDDED, format("comp_obj"));
+ public static final MediaType COMP_OBJ =
+ new MediaType(GENERAL_EMBEDDED, "format", "comp_obj");
/** Microsoft Excel */
public static final MediaType XLS = application("vnd.ms-excel");
@@ -122,13 +122,7 @@
/** Regexp for matching the MPP Project Data stream */
private static final Pattern mppDataMatch = Pattern.compile("\\s\\s\\s\\d+");
-
- private static Map<String,String> format(String format) {
- Map<String, String> params = new HashMap<String, String>();
- params.put("format", format);
- return params;
- }
-
+
public MediaType detect(InputStream input, Metadata metadata)
throws IOException {
// Check if we have access to the document
View
22 tika-parsers/src/main/java/org/apache/tika/parser/txt/TXTParser.java
@@ -18,6 +18,7 @@
import java.io.IOException;
import java.io.InputStream;
+import java.nio.charset.Charset;
import java.util.Collections;
import java.util.Set;
@@ -36,20 +37,14 @@
/**
* Plain text parser. The text encoding of the document stream is
* automatically detected based on the byte patterns found at the
- * beginning of the stream. The input metadata key
- * {@link org.apache.tika.metadata.HttpHeaders#CONTENT_ENCODING} is used
- * as an encoding hint if the automatic encoding detection fails.
+ * beginning of the stream and the given document metadata, most
+ * notably the <code>charset</code> parameter of a
+ * {@link org.apache.tika.metadata.HttpHeaders#CONTENT_TYPE} value.
* <p>
* This parser sets the following output metadata entries:
* <dl>
* <dt>{@link org.apache.tika.metadata.HttpHeaders#CONTENT_TYPE}</dt>
- * <dd><code>text/plain</code></dd>
- * <dt>{@link org.apache.tika.metadata.HttpHeaders#CONTENT_ENCODING}</dt>
- * <dd>The detected text encoding of the document.</dd>
- * <dt>
- * {@link org.apache.tika.metadata.HttpHeaders#CONTENT_LANGUAGE} and
- * {@link org.apache.tika.metadata.DublinCore#LANGUAGE}
- * </dt>
+ * <dd><code>text/plain; charset=...</code></dd>
* </dl>
*/
public class TXTParser extends AbstractParser {
@@ -75,8 +70,11 @@ public void parse(
AutoDetectReader reader = new AutoDetectReader(
new CloseShieldInputStream(stream), metadata, LOADER);
try {
- metadata.set(Metadata.CONTENT_TYPE, "text/plain"); // TODO: charset
- metadata.set(Metadata.CONTENT_ENCODING, reader.getCharset().name());
+ Charset charset = reader.getCharset();
+ MediaType type = new MediaType(MediaType.TEXT_PLAIN, charset);
+ metadata.set(Metadata.CONTENT_TYPE, type.toString());
+ // deprecated, see TIKA-431
+ metadata.set(Metadata.CONTENT_ENCODING, charset.name());
XHTMLContentHandler xhtml =
new XHTMLContentHandler(handler, metadata);
View
33 tika-parsers/src/main/java/org/apache/tika/parser/txt/UniversalEncodingListener.java
@@ -18,6 +18,7 @@
import java.nio.charset.Charset;
+import org.apache.tika.detect.TextStatistics;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.mime.MediaType;
import org.apache.tika.utils.CharsetUtils;
@@ -33,14 +34,16 @@
private static final String CHARSET_ISO_8859_1 = "ISO-8859-1";
+ private static final String CHARSET_ISO_8859_15 = "ISO-8859-15";
+
+ private final TextStatistics statistics = new TextStatistics();
+
private final UniversalDetector detector = new UniversalDetector(this);
private String hint = null;
private Charset charset = null;
- private boolean hasCR = false;
-
public UniversalEncodingListener(Metadata metadata) {
MediaType type = MediaType.parse(metadata.get(Metadata.CONTENT_TYPE));
if (type != null) {
@@ -54,11 +57,20 @@ public UniversalEncodingListener(Metadata metadata) {
public void report(String name) {
if (Constants.CHARSET_WINDOWS_1252.equals(name)) {
if (hint != null) {
- // Use the encoding hint to distinguish between latin charsets
+ // Use the encoding hint when available
name = hint;
- } else if (!hasCR) {
- // If there are no CRLFs, it's more likely to be ISO-8859-1
- name = CHARSET_ISO_8859_1;
+ } else if (statistics.count('\r') == 0) {
+ // If there are no CR(LF)s, then the encoding is more
+ // likely to be ISO-8859-1(5) than windows-1252
+ if (statistics.count(0xa4) > 0) { // currency/euro sign
+ // The general currency sign is hardly ever used in
+ // ISO-8859-1, so it's more likely that we're dealing
+ // with ISO-8859-15, where the character is used for
+ // the euro symbol, which is more commonly used.
+ name = CHARSET_ISO_8859_15;
+ } else {
+ name = CHARSET_ISO_8859_1;
+ }
}
}
try {
@@ -73,16 +85,15 @@ public boolean isDone() {
}
public void handleData(byte[] buf, int offset, int length) {
- for (int i = 0; !hasCR && i < length; i++) {
- if (buf[offset + i] == '\r') {
- hasCR = true;
- }
- }
+ statistics.addData(buf, offset, length);
detector.handleData(buf, offset, length);
}
public Charset dataEnd() {
detector.dataEnd();
+ if (charset == null && statistics.isMostlyAscii()) {
+ report(Constants.CHARSET_WINDOWS_1252);
+ }
return charset;
}
View
32 tika-parsers/src/test/java/org/apache/tika/mime/MimeTypesTest.java
@@ -19,11 +19,6 @@
import static org.apache.tika.mime.MediaType.OCTET_STREAM;
import static org.apache.tika.mime.MediaType.TEXT_PLAIN;
-import java.io.ByteArrayInputStream;
-import java.io.IOException;
-
-import org.apache.tika.metadata.Metadata;
-
import junit.framework.TestCase;
public class MimeTypesTest extends TestCase {
@@ -95,31 +90,4 @@ public void testCompareTo() {
assertTrue(html.compareTo(html) == 0);
}
- /** Test getMimeType(byte[])
- * @throws IOException */
- public void testGetMimeType_byteArray() throws IOException {
- // Plain text detection
- assertText(new byte[] { (byte) 0xFF, (byte) 0xFE });
- assertText(new byte[] { (byte) 0xFF, (byte) 0xFE });
- assertText(new byte[] { (byte) 0xEF, (byte) 0xFB, (byte) 0xBF });
- assertText(new byte[] { 'a', 'b', 'c' });
- assertText(new byte[] { '\t', '\r', '\n', 0x0C, 0x1B });
- assertNotText(new byte[] { '\t', '\r', '\n', 0x0E, 0x1C });
- }
-
- private void assertText(byte[] prefix) throws IOException {
- assertMagic("text/plain", prefix);
- }
-
- private void assertNotText(byte[] prefix) throws IOException {
- assertMagic("application/octet-stream", prefix);
- }
-
- private void assertMagic(String expected, byte[] prefix) throws IOException {
- MediaType type =
- types.detect(new ByteArrayInputStream(prefix), new Metadata());
- assertNotNull(type);
- assertEquals(expected, type.toString());
- }
-
}
View
26 tika-parsers/src/test/java/org/apache/tika/mime/TestMimeTypes.java
@@ -609,6 +609,32 @@ public void testEmlx() throws IOException {
assertTypeDetection("testEMLX.emlx", "message/x-emlx");
}
+ /** Test getMimeType(byte[]) */
+ public void testGetMimeType_byteArray() throws IOException {
+ // Plain text detection
+ assertText(new byte[] { (byte) 0xFF, (byte) 0xFE });
+ assertText(new byte[] { (byte) 0xFF, (byte) 0xFE });
+ assertText(new byte[] { (byte) 0xEF, (byte) 0xBB, (byte) 0xBF });
+ assertText(new byte[] { 'a', 'b', 'c' });
+ assertText(new byte[] { '\t', '\r', '\n', 0x0C, 0x1B });
+ assertNotText(new byte[] { '\t', '\r', '\n', 0x0E, 0x1C });
+ }
+
+ private void assertText(byte[] prefix) throws IOException {
+ assertMagic("text/plain", prefix);
+ }
+
+ private void assertNotText(byte[] prefix) throws IOException {
+ assertMagic("application/octet-stream", prefix);
+ }
+
+ private void assertMagic(String expected, byte[] prefix) throws IOException {
+ MediaType type =
+ repo.detect(new ByteArrayInputStream(prefix), new Metadata());
+ assertNotNull(type);
+ assertEquals(expected, type.toString());
+ }
+
private void assertType(String expected, String filename) throws Exception {
InputStream stream = TestMimeTypes.class.getResourceAsStream(
"/test-documents/" + filename);
View
8 tika-parsers/src/test/java/org/apache/tika/parser/AutoDetectParserTest.java
@@ -28,7 +28,6 @@
import org.apache.tika.detect.Detector;
import org.apache.tika.exception.TikaException;
import org.apache.tika.metadata.Metadata;
-import org.apache.tika.metadata.TikaCoreProperties;
import org.apache.tika.metadata.XMPDM;
import org.apache.tika.mime.MediaType;
import org.apache.tika.sax.BodyContentHandler;
@@ -40,14 +39,14 @@
// Easy to read constants for the MIME types:
private static final String RAW = "application/octet-stream";
private static final String EXCEL = "application/vnd.ms-excel";
- private static final String HTML = "text/html";
+ private static final String HTML = "text/html; charset=ISO-8859-1";
private static final String PDF = "application/pdf";
private static final String POWERPOINT = "application/vnd.ms-powerpoint";
private static final String KEYNOTE = "application/vnd.apple.keynote";
private static final String PAGES = "application/vnd.apple.pages";
private static final String NUMBERS = "application/vnd.apple.numbers";
private static final String RTF = "application/rtf";
- private static final String PLAINTEXT = "text/plain";
+ private static final String PLAINTEXT = "text/plain; charset=ISO-8859-1";
private static final String WORD = "application/msword";
private static final String XML = "application/xml";
private static final String RSS = "application/rss+xml";
@@ -236,11 +235,12 @@ public void testZipBombPrevention() throws Exception {
}
}
-
+
/**
* Test to ensure that the Vorbis and FLAC parsers have been correctly
* included, and are available
*/
+ @SuppressWarnings("deprecation")
public void testVorbisFlac() throws Exception {
// The three test files should all have similar test data
String[] testFiles = new String[] {
View
2  tika-parsers/src/test/java/org/apache/tika/parser/html/HtmlParserTest.java
@@ -571,7 +571,7 @@ public void testMetaTagHandling() throws Exception {
String result = sw.toString();
// <meta> tag for Content-Type should exist, but nothing for Language
- assertTrue(Pattern.matches("(?s).*<meta name=\"Content-Type\" content=\"text/html; charset=utf-8\"/>.*$", result));
+ assertTrue(Pattern.matches("(?s).*<meta name=\"Content-Type\" content=\"text/html; charset=UTF-8\"/>.*$", result));
assertFalse(Pattern.matches("(?s).*<meta name=\"Language\".*$", result));
}
View
85 tika-parsers/src/test/java/org/apache/tika/parser/txt/TXTParserTest.java
@@ -1,4 +1,4 @@
-/**
+/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
@@ -26,6 +26,7 @@
import org.apache.tika.sax.BodyContentHandler;
import org.apache.tika.sax.WriteOutContentHandler;
import org.xml.sax.ContentHandler;
+import org.xml.sax.helpers.DefaultHandler;
import junit.framework.TestCase;
@@ -42,14 +43,14 @@ public void testEnglishText() throws Exception {
Metadata metadata = new Metadata();
StringWriter writer = new StringWriter();
parser.parse(
- new ByteArrayInputStream(text.getBytes("UTF-8")),
+ new ByteArrayInputStream(text.getBytes("ISO-8859-1")),
new WriteOutContentHandler(writer),
metadata,
new ParseContext());
String content = writer.toString();
- assertEquals("text/plain", metadata.get(Metadata.CONTENT_TYPE));
-
+ assertEquals("text/plain; charset=ISO-8859-1", metadata.get(Metadata.CONTENT_TYPE));
+
// TIKA-501: Remove language detection from TXTParser
assertNull(metadata.get(Metadata.CONTENT_LANGUAGE));
assertNull(metadata.get(TikaCoreProperties.LANGUAGE));
@@ -68,8 +69,8 @@ public void testUTF8Text() throws Exception {
parser.parse(
new ByteArrayInputStream(text.getBytes("UTF-8")),
handler, metadata, new ParseContext());
- assertEquals("text/plain", metadata.get(Metadata.CONTENT_TYPE));
- assertEquals("UTF-8", metadata.get(Metadata.CONTENT_ENCODING));
+ assertEquals("text/plain; charset=UTF-8", metadata.get(Metadata.CONTENT_TYPE));
+ assertEquals("UTF-8", metadata.get(Metadata.CONTENT_ENCODING)); // deprecated
assertTrue(handler.toString().contains(text));
}
@@ -79,11 +80,50 @@ public void testEmptyText() throws Exception {
Metadata metadata = new Metadata();
parser.parse(
new ByteArrayInputStream(new byte[0]), handler, metadata, new ParseContext());
- assertEquals("text/plain", metadata.get(Metadata.CONTENT_TYPE));
+ assertEquals("text/plain; charset=UTF-8", metadata.get(Metadata.CONTENT_TYPE));
assertEquals("\n", handler.toString());
}
/**
+ * Test for the heuristics that we use to assign an eight-bit character
+ * encoding to mostly ASCII sequences. If a more specific match can not
+ * be made, a string with a CR(LF) in it is most probably windows-1252,
+ * otherwise ISO-8859-1, except if it contains the currency/euro symbol
+ * (byte 0xa4) in which case it's more likely to be ISO-8859-15.
+ */
+ public void testLatinDetectionHeuristics() throws Exception {
+ String windows = "test\r\n";
+ String unix = "test\n";
+ String euro = "test \u20ac\n";
+
+ Metadata metadata;
+
+ metadata = new Metadata();
+ parser.parse(
+ new ByteArrayInputStream(windows.getBytes("ISO-8859-15")),
+ new DefaultHandler(), metadata, new ParseContext());
+ assertEquals(
+ "text/plain; charset=windows-1252",
+ metadata.get(Metadata.CONTENT_TYPE));
+
+ metadata = new Metadata();
+ parser.parse(
+ new ByteArrayInputStream(unix.getBytes("ISO-8859-15")),
+ new DefaultHandler(), metadata, new ParseContext());
+ assertEquals(
+ "text/plain; charset=ISO-8859-1",
+ metadata.get(Metadata.CONTENT_TYPE));
+
+ metadata = new Metadata();
+ parser.parse(
+ new ByteArrayInputStream(euro.getBytes("ISO-8859-15")),
+ new DefaultHandler(), metadata, new ParseContext());
+ assertEquals(
+ "text/plain; charset=ISO-8859-15",
+ metadata.get(Metadata.CONTENT_TYPE));
+ }
+
+ /**
* Test case for TIKA-240: Drop the BOM when extracting plain text
*
* @see <a href="https://issues.apache.org/jira/browse/TIKA-240">TIKA-240</a>
@@ -111,15 +151,15 @@ public void testUseIncomingCharsetAsHint() throws Exception {
parser.parse(
new ByteArrayInputStream(test2.getBytes("ISO-8859-1")),
new BodyContentHandler(), metadata, new ParseContext());
-
- assertEquals("ISO-8859-1", metadata.get(Metadata.CONTENT_ENCODING));
+ assertEquals("text/plain; charset=ISO-8859-1", metadata.get(Metadata.CONTENT_TYPE));
+ assertEquals("ISO-8859-1", metadata.get(Metadata.CONTENT_ENCODING)); // deprecated
- metadata.set(Metadata.CONTENT_ENCODING, "ISO-8859-15");
+ metadata.set(Metadata.CONTENT_TYPE, "text/plain; charset=ISO-8859-15");
parser.parse(
new ByteArrayInputStream(test2.getBytes("ISO-8859-1")),
new BodyContentHandler(), metadata, new ParseContext());
-
- assertEquals("ISO-8859-15", metadata.get(Metadata.CONTENT_ENCODING));
+ assertEquals("text/plain; charset=ISO-8859-15", metadata.get(Metadata.CONTENT_TYPE));
+ assertEquals("ISO-8859-15", metadata.get(Metadata.CONTENT_ENCODING)); // deprecated
}
/**
@@ -136,16 +176,16 @@ public void testUsingCharsetInContentTypeHeader() throws Exception {
parser.parse(
new ByteArrayInputStream(test2.getBytes("ISO-8859-1")),
new BodyContentHandler(), metadata, new ParseContext());
-
- assertEquals("ISO-8859-1", metadata.get(Metadata.CONTENT_ENCODING));
+ assertEquals("text/plain; charset=ISO-8859-1", metadata.get(Metadata.CONTENT_TYPE));
+ assertEquals("ISO-8859-1", metadata.get(Metadata.CONTENT_ENCODING)); // deprecated
metadata = new Metadata();
metadata.set(Metadata.CONTENT_TYPE, "text/html; charset=ISO-8859-15");
parser.parse(
new ByteArrayInputStream(test2.getBytes("ISO-8859-1")),
new BodyContentHandler(), metadata, new ParseContext());
-
- assertEquals("ISO-8859-15", metadata.get(Metadata.CONTENT_ENCODING));
+ assertEquals("text/plain; charset=ISO-8859-15", metadata.get(Metadata.CONTENT_TYPE));
+ assertEquals("ISO-8859-15", metadata.get(Metadata.CONTENT_ENCODING)); // deprecated
}
private void assertExtractText(String msg, String expected, byte[] input)
@@ -157,7 +197,6 @@ public void ignorableWhitespace(char[] ch, int off, int len) {
};
Metadata metadata = new Metadata();
parser.parse(new ByteArrayInputStream(input), handler, metadata, new ParseContext());
- assertEquals("text/plain", metadata.get(Metadata.CONTENT_TYPE));
assertEquals(msg, expected, handler.toString());
}
@@ -188,8 +227,7 @@ public void testCP866() throws Exception {
metadata,
new ParseContext());
- assertEquals("text/plain", metadata.get(Metadata.CONTENT_TYPE));
- assertEquals("IBM866", metadata.get(Metadata.CONTENT_ENCODING));
+ assertEquals("text/plain; charset=IBM866", metadata.get(Metadata.CONTENT_TYPE));
}
public void testEBCDIC_CP500() throws Exception {
@@ -201,19 +239,18 @@ public void testEBCDIC_CP500() throws Exception {
metadata,
new ParseContext());
- assertEquals("text/plain", metadata.get(Metadata.CONTENT_TYPE));
- assertEquals("IBM500", metadata.get(Metadata.CONTENT_ENCODING));
-
+ assertEquals("text/plain; charset=IBM500", metadata.get(Metadata.CONTENT_TYPE));
+
// Additional check that it isn't too eager on short blocks of text
metadata = new Metadata();
writer = new StringWriter();
parser.parse(
- new ByteArrayInputStream("<html><body>hello world</body></html>".getBytes("UTF-8")),
+ new ByteArrayInputStream("<html><body>hello world</body></html>".getBytes("ISO-8859-1")),
new WriteOutContentHandler(writer),
metadata,
new ParseContext());
- assertNotSame("IBM500", metadata.get(Metadata.CONTENT_ENCODING));
+ assertEquals("text/plain; charset=ISO-8859-1", metadata.get(Metadata.CONTENT_TYPE));
}
}
Please sign in to comment.
Something went wrong with that request. Please try again.