TIKA-4196 -- add a bom EncodingDetector (#1590)

apache · Feb 12, 2024 · 7c758c3 · 7c758c3
1 parent 9147409
commit 7c758c3
Show file tree

Hide file tree

Showing 2 changed files with 184 additions and 0 deletions.
diff --git a/...modules/tika-parser-text-module/src/main/java/org/apache/tika/parser/txt/BOMDetector.java b/...modules/tika-parser-text-module/src/main/java/org/apache/tika/parser/txt/BOMDetector.java
@@ -0,0 +1,93 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.txt;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.nio.charset.Charset;
+import java.nio.charset.UnsupportedCharsetException;
+
+import org.apache.commons.io.ByteOrderMark;
+import org.apache.commons.io.IOUtils;
+
+import org.apache.tika.detect.EncodingDetector;
+import org.apache.tika.metadata.Metadata;
+
+/**
+ * An {@link EncodingDetector} that decides the charset solely from a byte order mark (BOM)
+ * at the start of the stream. BOMs for UTF-8, UTF-32 (BE/LE) and UTF-16 (BE/LE) are checked.
+ * <p>
+ * At most {@link #MAX_BYTES} bytes are read; the stream is marked before reading and reset
+ * afterwards, so the caller must supply a stream that supports mark/reset.
+ */
+public class BOMDetector implements EncodingDetector {
+
+    private static final ByteOrderMark[] BOMS =
+            //order matters -- have to try the 32 before the 16
+            //(the UTF-32LE BOM starts with the same two bytes as the UTF-16LE BOM)
+            new ByteOrderMark[] {
+                    ByteOrderMark.UTF_8,
+                    ByteOrderMark.UTF_32BE,
+                    ByteOrderMark.UTF_32LE,
+                    ByteOrderMark.UTF_16BE,
+                    ByteOrderMark.UTF_16LE
+            };
+
+    //charset for the BOM at the same index in BOMS; an entry stays null if
+    //the running JVM does not provide that charset
+    private static final Charset[] CHARSETS = new Charset[BOMS.length];
+
+    //length of the shortest BOM (UTF-16); fewer bytes than this can never match
+    private static final int MIN_BYTES = 2;
+    //length of the longest BOM (UTF-32); no need to read further than this
+    private static final int MAX_BYTES = 4;
+
+    static {
+        for (int i = 0; i < BOMS.length; i++) {
+            try {
+                CHARSETS[i] = Charset.forName(BOMS[i].getCharsetName());
+            } catch (UnsupportedCharsetException ignored) {
+                //Deliberately not fatal: the entry stays null, and detect() then
+                //returns null ("not detected") for streams carrying this BOM.
+            }
+        }
+    }
+
+    /**
+     * Looks for a known BOM in the first bytes of the stream.
+     *
+     * @param input    stream to inspect, or {@code null}; must support mark/reset and is
+     *                 reset to its original position before this method returns
+     * @param metadata ignored by this detector
+     * @return the charset matching the leading BOM, or {@code null} if the input is
+     *         {@code null}, is shorter than the shortest BOM, starts with no known BOM,
+     *         or starts with a BOM whose charset is not available in this JVM
+     * @throws IOException if reading from or resetting the stream fails
+     */
+    @Override
+    public Charset detect(InputStream input, Metadata metadata) throws IOException {
+        if (input == null) {
+            return null;
+        }
+        byte[] bytes = new byte[MAX_BYTES];
+        int numRead;
+        input.mark(MAX_BYTES);
+        try {
+            numRead = IOUtils.read(input, bytes);
+        } finally {
+            //always hand the stream back untouched, even if the read failed
+            input.reset();
+        }
+        if (numRead < MIN_BYTES) {
+            return null;
+        }
+        for (int i = 0; i < BOMS.length; i++) {
+            if (startsWith(BOMS[i], bytes, numRead)) {
+                //first match wins; do not fall through to a shorter BOM
+                return CHARSETS[i];
+            }
+        }
+        return null;
+    }
+
+    /**
+     * Tests whether the first {@code available} bytes of {@code bytes} begin with the BOM.
+     *
+     * @param bom       BOM to look for
+     * @param bytes     buffer holding the start of the stream
+     * @param available number of valid bytes in {@code bytes}
+     * @return {@code true} if enough bytes are available and they all match the BOM
+     */
+    private static boolean startsWith(ByteOrderMark bom, byte[] bytes, int available) {
+        byte[] bomBytes = bom.getBytes();
+        if (available < bomBytes.length) {
+            return false;
+        }
+        for (int i = 0; i < bomBytes.length; i++) {
+            if (bomBytes[i] != bytes[i]) {
+                return false;
+            }
+        }
+        return true;
+    }
+}
diff --git a/...les/tika-parser-text-module/src/test/java/org/apache/tika/parser/txt/BOMDetectorTest.java b/...les/tika-parser-text-module/src/test/java/org/apache/tika/parser/txt/BOMDetectorTest.java
@@ -0,0 +1,91 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.txt;
+
+import static org.junit.jupiter.api.Assertions.assertEquals;
+import static org.junit.jupiter.api.Assertions.assertNull;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.nio.charset.Charset;
+
+import org.apache.commons.io.ByteOrderMark;
+import org.apache.commons.io.IOUtils;
+import org.apache.commons.io.input.BOMInputStream;
+import org.apache.commons.io.input.UnsynchronizedByteArrayInputStream;
+import org.apache.commons.io.output.UnsynchronizedByteArrayOutputStream;
+import org.junit.jupiter.api.Test;
+
+import org.apache.tika.TikaTest;
+import org.apache.tika.detect.EncodingDetector;
+import org.apache.tika.metadata.Metadata;
+
+/**
+ * Tests for {@link BOMDetector}: full BOMs must be detected without consuming the stream,
+ * and truncated BOMs must not be detected.
+ */
+public class BOMDetectorTest extends TikaTest {
+
+    //every BOM exercised by the tests; shared so the two tests cannot drift apart
+    private static final ByteOrderMark[] ALL_BOMS = new ByteOrderMark[] {
+            ByteOrderMark.UTF_8, ByteOrderMark.UTF_16BE, ByteOrderMark.UTF_16LE,
+            ByteOrderMark.UTF_32BE, ByteOrderMark.UTF_32LE
+    };
+
+    //number of space characters written after the BOM by createStream
+    private static final int PAYLOAD_LENGTH = 100;
+
+    /**
+     * Each BOM followed by padding is detected as its own charset, and the detector
+     * leaves the whole stream (BOM included) available to read afterwards.
+     */
+    @Test
+    public void testBasic() throws Exception {
+        EncodingDetector detector = new BOMDetector();
+        for (ByteOrderMark bom : ALL_BOMS) {
+            byte[] data = createStream(bom).toByteArray();
+            //sanity check of the fixture against commons-io's own BOM handling
+            try (BOMInputStream bomInputStream =
+                         new BOMInputStream(new UnsynchronizedByteArrayInputStream(data),
+                                 ByteOrderMark.UTF_8, ByteOrderMark.UTF_32BE, ByteOrderMark.UTF_32LE,
+                                 ByteOrderMark.UTF_16BE, ByteOrderMark.UTF_16LE)) {
+                assertEquals(bom, bomInputStream.getBOM());
+            }
+            try (UnsynchronizedByteArrayInputStream is =
+                         new UnsynchronizedByteArrayInputStream(data)) {
+                assertEquals(Charset.forName(bom.getCharsetName()), detector.detect(is, new Metadata()));
+                //the detector must have reset the stream: every byte is still readable
+                int cnt = 0;
+                while (is.read() > -1) {
+                    cnt++;
+                }
+                assertEquals(PAYLOAD_LENGTH + bom.getBytes().length, cnt);
+            }
+        }
+    }
+
+    /**
+     * Only the first byte of each BOM is present, followed by two spaces;
+     * that is not a BOM, so nothing may be detected.
+     */
+    @Test
+    public void testShort() throws Exception {
+        EncodingDetector detector = new BOMDetector();
+        for (ByteOrderMark bom : ALL_BOMS) {
+            byte[] bytes = new byte[3];
+            System.arraycopy(bom.getBytes(), 0, bytes, 0, 1);
+            bytes[1] = (byte)32;
+            bytes[2] = (byte)32;
+            try (InputStream is = new UnsynchronizedByteArrayInputStream(bytes)) {
+                assertNull(detector.detect(is, new Metadata()));
+            }
+        }
+    }
+
+    /**
+     * Builds a buffer holding the BOM's bytes followed by {@link #PAYLOAD_LENGTH} spaces.
+     */
+    private UnsynchronizedByteArrayOutputStream createStream(ByteOrderMark bom) throws IOException {
+        UnsynchronizedByteArrayOutputStream bos = new UnsynchronizedByteArrayOutputStream();
+        IOUtils.write(bom.getBytes(), bos);
+        for (int i = 0; i < PAYLOAD_LENGTH; i++) {
+            bos.write(' ');
+        }
+        return bos;
+    }
+}