From 5a7ecf1a52d078cf005082c22d302ee9dcc2faf0 Mon Sep 17 00:00:00 2001 From: Simon Spero Date: Fri, 16 Jun 2017 20:17:13 -0400 Subject: [PATCH 1/9] COMPRESS-410 Remove Non-NIO character set encoders. As a special case, the UTF-8 encoder will replace malformed / unmappable input with '?'. This behavior is required for compatibility with existing behavior. Signed-off-by: Simon Spero (cherry picked from commit 0d41ac4) Signed-off-by: Simon Spero --- .../archivers/zip/FallbackZipEncoding.java | 96 ------ .../compress/archivers/zip/HasCharset.java | 12 + .../archivers/zip/Simple8BitZipEncoding.java | 279 ------------------ .../archivers/zip/ZipEncodingHelper.java | 165 ++--------- .../archivers/zip/ZipEncodingTest.java | 51 +++- 5 files changed, 77 insertions(+), 526 deletions(-) delete mode 100644 src/main/java/org/apache/commons/compress/archivers/zip/FallbackZipEncoding.java create mode 100644 src/main/java/org/apache/commons/compress/archivers/zip/HasCharset.java delete mode 100644 src/main/java/org/apache/commons/compress/archivers/zip/Simple8BitZipEncoding.java diff --git a/src/main/java/org/apache/commons/compress/archivers/zip/FallbackZipEncoding.java b/src/main/java/org/apache/commons/compress/archivers/zip/FallbackZipEncoding.java deleted file mode 100644 index 757bcbda97..0000000000 --- a/src/main/java/org/apache/commons/compress/archivers/zip/FallbackZipEncoding.java +++ /dev/null @@ -1,96 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ - -package org.apache.commons.compress.archivers.zip; - -import java.io.IOException; -import java.nio.ByteBuffer; - -/** - * A fallback ZipEncoding, which uses a java.io means to encode names. - * - *

This implementation is not suitable for encodings other than - * UTF-8, because java.io encodes unmappable character as question - * marks leading to unreadable ZIP entries on some operating - * systems.

- * - *

Furthermore this implementation is unable to tell whether a - * given name can be safely encoded or not.

- * - *

This implementation acts as a last resort implementation, when - * neither {@link Simple8BitZipEnoding} nor {@link NioZipEncoding} is - * available.

- * - *

The methods of this class are reentrant.

- * @Immutable - */ -class FallbackZipEncoding implements ZipEncoding { - private final String charsetName; - - /** - * Construct a fallback zip encoding, which uses the platform's - * default charset. - */ - public FallbackZipEncoding() { - this.charsetName = null; - } - - /** - * Construct a fallback zip encoding, which uses the given charset. - * - * @param charsetName The name of the charset or {@code null} for - * the platform's default character set. - */ - public FallbackZipEncoding(final String charsetName) { - this.charsetName = charsetName; - } - - /** - * @see - * org.apache.commons.compress.archivers.zip.ZipEncoding#canEncode(java.lang.String) - */ - @Override - public boolean canEncode(final String name) { - return true; - } - - /** - * @see - * org.apache.commons.compress.archivers.zip.ZipEncoding#encode(java.lang.String) - */ - @Override - public ByteBuffer encode(final String name) throws IOException { - if (this.charsetName == null) { // i.e. use default charset, see no-args constructor - return ByteBuffer.wrap(name.getBytes()); - } - return ByteBuffer.wrap(name.getBytes(this.charsetName)); - } - - /** - * @see - * org.apache.commons.compress.archivers.zip.ZipEncoding#decode(byte[]) - */ - @Override - public String decode(final byte[] data) throws IOException { - if (this.charsetName == null) { // i.e. use default charset, see no-args constructor - return new String(data); - } - return new String(data,this.charsetName); - } -} diff --git a/src/main/java/org/apache/commons/compress/archivers/zip/HasCharset.java b/src/main/java/org/apache/commons/compress/archivers/zip/HasCharset.java new file mode 100644 index 0000000000..09dfcede13 --- /dev/null +++ b/src/main/java/org/apache/commons/compress/archivers/zip/HasCharset.java @@ -0,0 +1,12 @@ +package org.apache.commons.compress.archivers.zip; + +import java.nio.charset.Charset; + +public interface HasCharset { + + /** + * + * @return the character set associated with this object + */ + Charset getCharset(); +} diff --git a/src/main/java/org/apache/commons/compress/archivers/zip/Simple8BitZipEncoding.java b/src/main/java/org/apache/commons/compress/archivers/zip/Simple8BitZipEncoding.java deleted file mode 100644 index 1bd0f9c7b6..0000000000 --- a/src/main/java/org/apache/commons/compress/archivers/zip/Simple8BitZipEncoding.java +++ /dev/null @@ -1,279 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ - -package org.apache.commons.compress.archivers.zip; - -import java.io.IOException; -import java.nio.ByteBuffer; -import java.util.ArrayList; -import java.util.Collections; -import java.util.List; - -/** - * This ZipEncoding implementation implements a simple 8bit character - * set, which mets the following restrictions: - * - *
    - *
  • Characters 0x0000 to 0x007f are encoded as the corresponding - * byte values 0x00 to 0x7f.
  • - *
  • All byte codes from 0x80 to 0xff are mapped to a unique unicode - * character in the range 0x0080 to 0x7fff. (No support for - * UTF-16 surrogates) - *
- * - *

These restrictions most notably apply to the most prominent - * omissions of java-1.4's {@link java.nio.charset.Charset Charset} - * implementation, Cp437 and Cp850.

- * - *

The methods of this class are reentrant.

- * @Immutable - */ -class Simple8BitZipEncoding implements ZipEncoding { - - /** - * A character entity, which is put to the reverse mapping table - * of a simple encoding. - */ - private static final class Simple8BitChar implements Comparable { - public final char unicode; - public final byte code; - - Simple8BitChar(final byte code, final char unicode) { - this.code = code; - this.unicode = unicode; - } - - @Override - public int compareTo(final Simple8BitChar a) { - return this.unicode - a.unicode; - } - - @Override - public String toString() { - return "0x" + Integer.toHexString(0xffff & unicode) - + "->0x" + Integer.toHexString(0xff & code); - } - - @Override - public boolean equals(final Object o) { - if (o instanceof Simple8BitChar) { - final Simple8BitChar other = (Simple8BitChar) o; - return unicode == other.unicode && code == other.code; - } - return false; - } - - @Override - public int hashCode() { - return unicode; - } - } - - /** - * The characters for byte values of 128 to 255 stored as an array of - * 128 chars. - */ - private final char[] highChars; - - /** - * A list of {@link Simple8BitChar} objects sorted by the unicode - * field. This list is used to binary search reverse mapping of - * unicode characters with a character code greater than 127. - */ - private final List reverseMapping; - - /** - * @param highChars The characters for byte values of 128 to 255 - * stored as an array of 128 chars. - */ - public Simple8BitZipEncoding(final char[] highChars) { - this.highChars = highChars.clone(); - final List temp = - new ArrayList<>(this.highChars.length); - - byte code = 127; - - for (final char highChar : this.highChars) { - temp.add(new Simple8BitChar(++code, highChar)); - } - - Collections.sort(temp); - this.reverseMapping = Collections.unmodifiableList(temp); - } - - /** - * Return the character code for a given encoded byte. - * - * @param b The byte to decode. - * @return The associated character value. - */ - public char decodeByte(final byte b) { - // code 0-127 - if (b >= 0) { - return (char) b; - } - - // byte is signed, so 128 == -128 and 255 == -1 - return this.highChars[128 + b]; - } - - /** - * @param c The character to encode. - * @return Whether the given unicode character is covered by this encoding. - */ - public boolean canEncodeChar(final char c) { - - if (c >= 0 && c < 128) { - return true; - } - - final Simple8BitChar r = this.encodeHighChar(c); - return r != null; - } - - /** - * Pushes the encoded form of the given character to the given byte buffer. - * - * @param bb The byte buffer to write to. - * @param c The character to encode. - * @return Whether the given unicode character is covered by this encoding. - * If {@code false} is returned, nothing is pushed to the - * byte buffer. - */ - public boolean pushEncodedChar(final ByteBuffer bb, final char c) { - - if (c >= 0 && c < 128) { - bb.put((byte) c); - return true; - } - - final Simple8BitChar r = this.encodeHighChar(c); - if (r == null) { - return false; - } - bb.put(r.code); - return true; - } - - /** - * @param c A unicode character in the range from 0x0080 to 0x7f00 - * @return A Simple8BitChar, if this character is covered by this encoding. - * A {@code null} value is returned, if this character is not - * covered by this encoding. - */ - private Simple8BitChar encodeHighChar(final char c) { - // for performance an simplicity, yet another reincarnation of - // binary search... - int i0 = 0; - int i1 = this.reverseMapping.size(); - - while (i1 > i0) { - - final int i = i0 + (i1 - i0) / 2; - - final Simple8BitChar m = this.reverseMapping.get(i); - - if (m.unicode == c) { - return m; - } - - if (m.unicode < c) { - i0 = i + 1; - } else { - i1 = i; - } - } - - if (i0 >= this.reverseMapping.size()) { - return null; - } - - final Simple8BitChar r = this.reverseMapping.get(i0); - - if (r.unicode != c) { - return null; - } - - return r; - } - - /** - * @see - * org.apache.commons.compress.archivers.zip.ZipEncoding#canEncode(java.lang.String) - */ - @Override - public boolean canEncode(final String name) { - - for (int i=0;i simpleEncodings; - - static { - final Map se = - new HashMap<>(); - - final char[] cp437_high_chars = - new char[] { 0x00c7, 0x00fc, 0x00e9, 0x00e2, 0x00e4, 0x00e0, - 0x00e5, 0x00e7, 0x00ea, 0x00eb, 0x00e8, 0x00ef, - 0x00ee, 0x00ec, 0x00c4, 0x00c5, 0x00c9, 0x00e6, - 0x00c6, 0x00f4, 0x00f6, 0x00f2, 0x00fb, 0x00f9, - 0x00ff, 0x00d6, 0x00dc, 0x00a2, 0x00a3, 0x00a5, - 0x20a7, 0x0192, 0x00e1, 0x00ed, 0x00f3, 0x00fa, - 0x00f1, 0x00d1, 0x00aa, 0x00ba, 0x00bf, 0x2310, - 0x00ac, 0x00bd, 0x00bc, 0x00a1, 0x00ab, 0x00bb, - 0x2591, 0x2592, 0x2593, 0x2502, 0x2524, 0x2561, - 0x2562, 0x2556, 0x2555, 0x2563, 0x2551, 0x2557, - 0x255d, 0x255c, 0x255b, 0x2510, 0x2514, 0x2534, - 0x252c, 0x251c, 0x2500, 0x253c, 0x255e, 0x255f, - 0x255a, 0x2554, 0x2569, 0x2566, 0x2560, 0x2550, - 0x256c, 0x2567, 0x2568, 0x2564, 0x2565, 0x2559, - 0x2558, 0x2552, 0x2553, 0x256b, 0x256a, 0x2518, - 0x250c, 0x2588, 0x2584, 0x258c, 0x2590, 0x2580, - 0x03b1, 0x00df, 0x0393, 0x03c0, 0x03a3, 0x03c3, - 0x00b5, 0x03c4, 0x03a6, 0x0398, 0x03a9, 0x03b4, - 0x221e, 0x03c6, 0x03b5, 0x2229, 0x2261, 0x00b1, - 0x2265, 0x2264, 0x2320, 0x2321, 0x00f7, 0x2248, - 0x00b0, 0x2219, 0x00b7, 0x221a, 0x207f, 0x00b2, - 0x25a0, 0x00a0 }; - - final SimpleEncodingHolder cp437 = new SimpleEncodingHolder(cp437_high_chars); - - se.put("CP437", cp437); - se.put("Cp437", cp437); - se.put("cp437", cp437); - se.put("IBM437", cp437); - se.put("ibm437", cp437); - - final char[] cp850_high_chars = - new char[] { 0x00c7, 0x00fc, 0x00e9, 0x00e2, 0x00e4, 0x00e0, - 0x00e5, 0x00e7, 0x00ea, 0x00eb, 0x00e8, 0x00ef, - 0x00ee, 0x00ec, 0x00c4, 0x00c5, 0x00c9, 0x00e6, - 0x00c6, 0x00f4, 0x00f6, 0x00f2, 0x00fb, 0x00f9, - 0x00ff, 0x00d6, 0x00dc, 0x00f8, 0x00a3, 0x00d8, - 0x00d7, 0x0192, 0x00e1, 0x00ed, 0x00f3, 0x00fa, - 0x00f1, 0x00d1, 0x00aa, 0x00ba, 0x00bf, 0x00ae, - 0x00ac, 0x00bd, 0x00bc, 0x00a1, 0x00ab, 0x00bb, - 0x2591, 0x2592, 0x2593, 0x2502, 0x2524, 0x00c1, - 0x00c2, 0x00c0, 0x00a9, 0x2563, 0x2551, 0x2557, - 0x255d, 0x00a2, 0x00a5, 0x2510, 0x2514, 0x2534, - 0x252c, 0x251c, 0x2500, 0x253c, 0x00e3, 0x00c3, - 0x255a, 0x2554, 0x2569, 0x2566, 0x2560, 0x2550, - 0x256c, 0x00a4, 0x00f0, 0x00d0, 0x00ca, 0x00cb, - 0x00c8, 0x0131, 0x00cd, 0x00ce, 0x00cf, 0x2518, - 0x250c, 0x2588, 0x2584, 0x00a6, 0x00cc, 0x2580, - 0x00d3, 0x00df, 0x00d4, 0x00d2, 0x00f5, 0x00d5, - 0x00b5, 0x00fe, 0x00de, 0x00da, 0x00db, 0x00d9, - 0x00fd, 0x00dd, 0x00af, 0x00b4, 0x00ad, 0x00b1, - 0x2017, 0x00be, 0x00b6, 0x00a7, 0x00f7, 0x00b8, - 0x00b0, 0x00a8, 0x00b7, 0x00b9, 0x00b3, 0x00b2, - 0x25a0, 0x00a0 }; - - final SimpleEncodingHolder cp850 = new SimpleEncodingHolder(cp850_high_chars); - - se.put("CP850", cp850); - se.put("Cp850", cp850); - se.put("cp850", cp850); - se.put("IBM850", cp850); - se.put("ibm850", cp850); - simpleEncodings = Collections.unmodifiableMap(se); - } - /** * Grow a byte buffer, so it has a minimal capacity or at least - * the double capacity of the original buffer - * + * the double capacity of the original buffer + * * @param b The original buffer. * @param newCapacity The minimal requested new capacity. * @return A byte buffer r with @@ -160,7 +51,7 @@ static ByteBuffer growBuffer(final ByteBuffer b, final int newCapacity) { return on; } - + /** * The hexadecimal digits 0,...,9,A,...,F encoded as * ASCII bytes. @@ -174,7 +65,7 @@ static ByteBuffer growBuffer(final ByteBuffer b, final int newCapacity) { /** * Append %Uxxxx to the given byte buffer. * The caller must assure, that bb.remaining()>=6. - * + * * @param bb The byte buffer to write to. * @param c The character to write. */ @@ -198,47 +89,37 @@ static void appendSurrogate(final ByteBuffer bb, final char c) { /** * name of the encoding UTF-8 */ - static final ZipEncoding UTF8_ZIP_ENCODING = new FallbackZipEncoding(UTF8); + static final ZipEncoding UTF8_ZIP_ENCODING = getZipEncoding("UTF-8"); /** - * Instantiates a zip encoding. - * + * Instantiates a zip encoding. An NIO based character set encoder/decoder will be returned. + * As a special case, if the character set is UTF-8, the nio encoder will be configured replace malformed and + * unmappable characters with '?'. This matches existing behavior from the older fallback encoder. + *

+ * If the requested characer set cannot be found, the platform default will + * be used instead. + *

* @param name The name of the zip encoding. Specify {@code null} for * the platform's default encoding. * @return A zip encoding for the given encoding name. */ public static ZipEncoding getZipEncoding(final String name) { - - // fallback encoding is good enough for UTF-8. - if (isUTF8(name)) { - return UTF8_ZIP_ENCODING; - } - - if (name == null) { - return new FallbackZipEncoding(); - } - - final SimpleEncodingHolder h = simpleEncodings.get(name); - - if (h!=null) { - return h.getEncoding(); + Charset cs = Charset.defaultCharset(); + if (name != null) { + try { + cs = Charset.forName(name); + } catch (UnsupportedCharsetException e) { + } } + boolean useReplacement = cs.name().equals("UTF-8"); + return new NioZipEncoding(cs, useReplacement); - try { - - final Charset cs = Charset.forName(name); - return new NioZipEncoding(cs); - - } catch (final UnsupportedCharsetException e) { - return new FallbackZipEncoding(name); - } } /** * Returns whether a given encoding is UTF-8. If the given name is null, then check the platform's default encoding. - * - * @param charsetName - * If the given name is null, then check the platform's default encoding. + * + * @param charsetName If the given name is null, then check the platform's default encoding. */ static boolean isUTF8(String charsetName) { if (charsetName == null) { diff --git a/src/test/java/org/apache/commons/compress/archivers/zip/ZipEncodingTest.java b/src/test/java/org/apache/commons/compress/archivers/zip/ZipEncodingTest.java index f0c049a039..f3e512719c 100644 --- a/src/test/java/org/apache/commons/compress/archivers/zip/ZipEncodingTest.java +++ b/src/test/java/org/apache/commons/compress/archivers/zip/ZipEncodingTest.java @@ -19,13 +19,16 @@ package org.apache.commons.compress.archivers.zip; +import static org.junit.Assert.assertFalse; +import static org.junit.Assert.assertNotNull; +import static org.junit.Assert.assertThat; +import static org.junit.Assert.assertTrue; + import java.io.IOException; import java.nio.ByteBuffer; - +import java.nio.charset.Charset; import org.apache.commons.compress.utils.CharsetNames; - -import static org.junit.Assert.*; - +import org.hamcrest.core.IsInstanceOf; import org.junit.Assert; import org.junit.Test; @@ -33,6 +36,7 @@ * Test zip encodings. */ public class ZipEncodingTest { + private static final String UNENC_STRING = "\u2016"; // stress test for internal grow method. @@ -43,15 +47,44 @@ public class ZipEncodingTest { "%U2016%U2015%U2016%U2015%U2016%U2015%U2016%U2015%U2016%U2015%U2016"; @Test - public void testSimpleCp437Encoding() throws IOException { - - doSimpleEncodingTest("Cp437", null); + public void testNothingToMakeCoverallsHappier() { + Object o = new ZipEncodingHelper() { + }; + assertNotNull(o); + } + @Test + public void testGetNonexistentEncodng() throws IOException { + ZipEncoding ze = ZipEncodingHelper.getZipEncoding("I-am-a-banana"); + assertNotNull(ze); + if (ze instanceof HasCharset) { + HasCharset hasCharset = (HasCharset) ze; + Assert.assertEquals(Charset.defaultCharset(),hasCharset.getCharset()); + } } + @Test + public void testIsUTF8() throws IOException { + assertTrue(ZipEncodingHelper.isUTF8("UTF-8")); + assertTrue(ZipEncodingHelper.isUTF8("UTF8")); + Assert.assertEquals(Charset.defaultCharset().name().equals("UTF-8"),ZipEncodingHelper.isUTF8(null)); + } + @Test + public void testSimpleCp437Encoding() throws IOException { + doSimpleEncodingsTest(437); + } @Test public void testSimpleCp850Encoding() throws IOException { + doSimpleEncodingsTest(850); + } + - doSimpleEncodingTest("Cp850", null); + private void doSimpleEncodingsTest(int n) throws IOException { + + doSimpleEncodingTest("Cp" + n, null); + doSimpleEncodingTest("cp" + n, null); + doSimpleEncodingTest("CP" + n, null); + doSimpleEncodingTest("IBM" + n, null); + doSimpleEncodingTest("ibm" + n, null); } @Test @@ -127,7 +160,7 @@ private void doSimpleEncodingTest(final String name, byte[] testBytes) throws IOException { final ZipEncoding enc = ZipEncodingHelper.getZipEncoding(name); - + assertThat(enc, IsInstanceOf.instanceOf(NioZipEncoding.class)); if (testBytes == null) { testBytes = new byte[256]; From 8125dccb36792fdcc83c7359d13ecade39852562 Mon Sep 17 00:00:00 2001 From: Simon Spero Date: Sat, 17 Jun 2017 12:45:44 -0400 Subject: [PATCH 2/9] javadoc for HasCharset Signed-off-by: Simon Spero (cherry picked from commit b70c7c2) Signed-off-by: Simon Spero --- .../commons/compress/archivers/zip/HasCharset.java | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/src/main/java/org/apache/commons/compress/archivers/zip/HasCharset.java b/src/main/java/org/apache/commons/compress/archivers/zip/HasCharset.java index 09dfcede13..2e392a8aac 100644 --- a/src/main/java/org/apache/commons/compress/archivers/zip/HasCharset.java +++ b/src/main/java/org/apache/commons/compress/archivers/zip/HasCharset.java @@ -2,10 +2,22 @@ import java.nio.charset.Charset; +/** + * An interface added to allow access to the character set associated with an {@link NioZipEncoding}, + * without requiring a new method to be added to {@link ZipEncoding}. + *

+ * This avoids introducing a + * potentially breaking change, or making {@link NioZipEncoding} a public class. + *

+ */ public interface HasCharset { /** - * + * Provides access to the character set associated with an object. + *

+ * This allows nio oriented code to use more natural character encoding/decoding methods, + * whilst allowing existing code to continue to rely on special-case error handling for UTF-8. + *

* @return the character set associated with this object */ Charset getCharset(); From cf3fa1ecd677b0d405e6018b91aa13f90ecb3b7c Mon Sep 17 00:00:00 2001 From: Simon Spero Date: Fri, 16 Jun 2017 20:17:13 -0400 Subject: [PATCH 3/9] COMPRESS-410 Remove Non-NIO character set encoders. As a special case, the UTF-8 encoder will replace malformed / unmappable input with '?'. This behavior is required for compatibility with existing behavior. Signed-off-by: Simon Spero (cherry picked from commit 1987719) Signed-off-by: Simon Spero --- .../archivers/zip/NioZipEncoding.java | 80 +++++++++++++------ 1 file changed, 55 insertions(+), 25 deletions(-) diff --git a/src/main/java/org/apache/commons/compress/archivers/zip/NioZipEncoding.java b/src/main/java/org/apache/commons/compress/archivers/zip/NioZipEncoding.java index ffd2efd5a3..6f0306b5c3 100644 --- a/src/main/java/org/apache/commons/compress/archivers/zip/NioZipEncoding.java +++ b/src/main/java/org/apache/commons/compress/archivers/zip/NioZipEncoding.java @@ -23,6 +23,7 @@ import java.nio.ByteBuffer; import java.nio.CharBuffer; import java.nio.charset.Charset; +import java.nio.charset.CharsetDecoder; import java.nio.charset.CharsetEncoder; import java.nio.charset.CoderResult; import java.nio.charset.CodingErrorAction; @@ -30,54 +31,84 @@ /** * A ZipEncoding, which uses a java.nio {@link * java.nio.charset.Charset Charset} to encode names. - * - *

This implementation works for all cases under java-1.5 or - * later. However, in java-1.4, some charsets don't have a java.nio - * implementation, most notably the default ZIP encoding Cp437.

- * *

The methods of this class are reentrant.

* @Immutable */ -class NioZipEncoding implements ZipEncoding { +class NioZipEncoding implements ZipEncoding,HasCharset { + private final Charset charset; + private boolean useReplacement= false; + private static final byte[] REPLACEMENT_BYTES = new byte[]{'?'}; + private static final String REPLACEMENT_STRING = "?"; /** * Construct an NIO based zip encoding, which wraps the given * charset. - * + * * @param charset The NIO charset to wrap. */ - public NioZipEncoding(final Charset charset) { + NioZipEncoding(final Charset charset) { this.charset = charset; } + NioZipEncoding(final Charset charset, boolean useReplacement) { + this(charset); + this.useReplacement = useReplacement; + + } + + @Override + public Charset getCharset() { + return charset; + } + /** - * @see - * org.apache.commons.compress.archivers.zip.ZipEncoding#canEncode(java.lang.String) + * @see ZipEncoding#canEncode(java.lang.String) */ @Override public boolean canEncode(final String name) { - final CharsetEncoder enc = this.charset.newEncoder(); - enc.onMalformedInput(CodingErrorAction.REPORT); - enc.onUnmappableCharacter(CodingErrorAction.REPORT); + final CharsetEncoder enc = newEncoder(); return enc.canEncode(name); } + private CharsetEncoder newEncoder() { + if (useReplacement) { + return charset.newEncoder() + .onMalformedInput(CodingErrorAction.REPLACE) + .onUnmappableCharacter(CodingErrorAction.REPLACE) + .replaceWith(REPLACEMENT_BYTES); + } else { + return charset.newEncoder() + .onMalformedInput(CodingErrorAction.REPORT) + .onUnmappableCharacter(CodingErrorAction.REPORT); + } + } + + private CharsetDecoder newDecoder() { + if (!useReplacement) { + return this.charset.newDecoder() + .onMalformedInput(CodingErrorAction.REPORT) + .onUnmappableCharacter(CodingErrorAction.REPORT); + } else { + return charset.newDecoder() + .onMalformedInput(CodingErrorAction.REPLACE) + .onUnmappableCharacter(CodingErrorAction.REPLACE) + .replaceWith(REPLACEMENT_STRING); + } + } + + /** - * @see - * org.apache.commons.compress.archivers.zip.ZipEncoding#encode(java.lang.String) + * @see ZipEncoding#encode(java.lang.String) */ @Override public ByteBuffer encode(final String name) { - final CharsetEncoder enc = this.charset.newEncoder(); - - enc.onMalformedInput(CodingErrorAction.REPORT); - enc.onUnmappableCharacter(CodingErrorAction.REPORT); + final CharsetEncoder enc = newEncoder(); final CharBuffer cb = CharBuffer.wrap(name); - ByteBuffer out = ByteBuffer.allocate(name.length() - + (name.length() + 1) / 2); + int estimatedSize = (int) Math.ceil(name.length() * enc.averageBytesPerChar()); + ByteBuffer out = ByteBuffer.allocate(estimatedSize); while (cb.remaining() > 0) { final CoderResult res = enc.encode(cb, out,true); @@ -114,13 +145,12 @@ public ByteBuffer encode(final String name) { /** * @see - * org.apache.commons.compress.archivers.zip.ZipEncoding#decode(byte[]) + * ZipEncoding#decode(byte[]) */ @Override public String decode(final byte[] data) throws IOException { - return this.charset.newDecoder() - .onMalformedInput(CodingErrorAction.REPORT) - .onUnmappableCharacter(CodingErrorAction.REPORT) + return newDecoder() .decode(ByteBuffer.wrap(data)).toString(); } + } From fb52100291ed3f69581f6a6dc5062a07e6d52ae8 Mon Sep 17 00:00:00 2001 From: Simon Spero Date: Sun, 18 Jun 2017 18:55:38 -0400 Subject: [PATCH 4/9] Do better estimating of required buffer size for character encoding. If an unencodable character is found that requires output buffer expansion, scan buffer for all such characters, and attempt to expand buffer only once. Signed-off-by: Simon Spero (cherry picked from commit aa30e21) Signed-off-by: Simon Spero --- .../archivers/zip/NioZipEncoding.java | 109 +++++++++++++++--- .../archivers/zip/ZipEncodingHelper.java | 10 ++ .../archivers/zip/ZipEncodingTest.java | 5 +- 3 files changed, 104 insertions(+), 20 deletions(-) diff --git a/src/main/java/org/apache/commons/compress/archivers/zip/NioZipEncoding.java b/src/main/java/org/apache/commons/compress/archivers/zip/NioZipEncoding.java index 6f0306b5c3..fed597f960 100644 --- a/src/main/java/org/apache/commons/compress/archivers/zip/NioZipEncoding.java +++ b/src/main/java/org/apache/commons/compress/archivers/zip/NioZipEncoding.java @@ -48,11 +48,11 @@ class NioZipEncoding implements ZipEncoding,HasCharset { * @param charset The NIO charset to wrap. */ NioZipEncoding(final Charset charset) { - this.charset = charset; + this(charset, false); } NioZipEncoding(final Charset charset, boolean useReplacement) { - this(charset); + this.charset = charset; this.useReplacement = useReplacement; } @@ -107,42 +107,115 @@ public ByteBuffer encode(final String name) { final CharsetEncoder enc = newEncoder(); final CharBuffer cb = CharBuffer.wrap(name); - int estimatedSize = (int) Math.ceil(name.length() * enc.averageBytesPerChar()); - ByteBuffer out = ByteBuffer.allocate(estimatedSize); + CharBuffer tmp=null; + ByteBuffer out = ByteBuffer.allocate(estimateInitialBufferSize(enc, cb.remaining())); while (cb.remaining() > 0) { - final CoderResult res = enc.encode(cb, out,true); + final CoderResult res = enc.encode(cb, out, false); if (res.isUnmappable() || res.isMalformed()) { // write the unmappable characters in utf-16 // pseudo-URL encoding style to ByteBuffer. - if (res.length() * 6 > out.remaining()) { - out = ZipEncodingHelper.growBuffer(out, out.position() - + res.length() * 6); - } - for (int i=0; i out.remaining()) { + // if the destination buffer isn't over sized, assume that the presence of one + // unmappable character makes it likely that there will be more. Find all the + // un-encoded characters and allocate space based on those estimates. + int charCount = 0; + for (int i = cb.position() ; i < cb.limit(); i++) { + if (!enc.canEncode(cb.get(i))) { + charCount+= 6; + } else { + charCount++; + } + } + int totalExtraSpace = estimateIncrementalEncodingSize(enc, charCount); + out = ZipEncodingHelper.growBufferBy(out, totalExtraSpace- out.remaining()); + } + if(tmp == null) { + tmp = CharBuffer.allocate(6); + } + for (int i = 0; i < res.length(); ++i) { + out = encodeFully(enc, encodeSurrogate(tmp,cb.get()), out); } } else if (res.isOverflow()) { + int increment = estimateIncrementalEncodingSize(enc, cb.remaining()); + out = ZipEncodingHelper.growBufferBy(out, increment); + } + } + CoderResult coderResult = enc.encode(cb, out, true); - out = ZipEncodingHelper.growBuffer(out, 0); + if (!coderResult.isUnderflow()) { + throw new RuntimeException("unexpected coder result: " + coderResult); + } - } else if (res.isUnderflow()) { + out.limit(out.position()); + out.rewind(); + return out; + } - enc.flush(out); + private static ByteBuffer encodeFully(CharsetEncoder enc, CharBuffer cb, ByteBuffer out) { + while (cb.hasRemaining()) { + CoderResult result = enc.encode(cb, out, false); + if (result.isOverflow()) { + int increment = estimateIncrementalEncodingSize(enc, cb.remaining()); + out = ZipEncodingHelper.growBufferBy(out, increment); + } else { break; - } } - - out.limit(out.position()); - out.rewind(); return out; } + static char[] HEX_CHARS = new char[]{ + '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', 'A', 'B', 'C', 'D', 'E', 'F' + }; + + private CharBuffer encodeSurrogate( CharBuffer cb,char c) { + cb.position(0).limit(6); + cb.put('%'); + cb.put('U'); + + cb.put(HEX_CHARS[(c >> 12) & 0x0f]); + cb.put(HEX_CHARS[(c >> 8) & 0x0f]); + cb.put(HEX_CHARS[(c >> 4) & 0x0f]); + cb.put(HEX_CHARS[c & 0x0f]); + cb.flip(); + return cb; + } + + /** + * Estimate the initial encoded size (in bytes) for a character buffer. + *

+ * The estimate assumes that one character consumes uses the maximum length encoding, + * whilst the rest use an average size encoding. This accounts for any BOM for UTF-16, at + * the expense of a couple of extra bytes for UTF-8 encoded ASCII. + *

+ * + * @param enc encoder to use for estimates + * @param charChount number of characters in string + * @return estimated size in bytes. + */ + private int estimateInitialBufferSize(CharsetEncoder enc, int charChount) { + float first = enc.maxBytesPerChar(); + float rest = (charChount - 1) * enc.averageBytesPerChar(); + return (int) Math.ceil(first + rest); + } + + /** + * Estimate the size needed for remaining characters + * + * @param enc encoder to use for estimates + * @param charCount number of characters remaining + * @return estimated size in bytes. + */ + private static int estimateIncrementalEncodingSize(CharsetEncoder enc, int charCount) { + return (int) Math.ceil(charCount * enc.averageBytesPerChar()); + } + /** * @see * ZipEncoding#decode(byte[]) diff --git a/src/main/java/org/apache/commons/compress/archivers/zip/ZipEncodingHelper.java b/src/main/java/org/apache/commons/compress/archivers/zip/ZipEncodingHelper.java index 18ad103c68..f31d75c8ee 100644 --- a/src/main/java/org/apache/commons/compress/archivers/zip/ZipEncodingHelper.java +++ b/src/main/java/org/apache/commons/compress/archivers/zip/ZipEncodingHelper.java @@ -136,4 +136,14 @@ static boolean isUTF8(String charsetName) { } return false; } + + static ByteBuffer growBufferBy(ByteBuffer buffer, int increment) { + buffer.limit(buffer.position()); + buffer.rewind(); + + final ByteBuffer on = ByteBuffer.allocate(buffer.capacity() + increment); + + on.put(buffer); + return on; + } } diff --git a/src/test/java/org/apache/commons/compress/archivers/zip/ZipEncodingTest.java b/src/test/java/org/apache/commons/compress/archivers/zip/ZipEncodingTest.java index f3e512719c..ce0934f24a 100644 --- a/src/test/java/org/apache/commons/compress/archivers/zip/ZipEncodingTest.java +++ b/src/test/java/org/apache/commons/compress/archivers/zip/ZipEncodingTest.java @@ -180,8 +180,9 @@ private void doSimpleEncodingTest(final String name, byte[] testBytes) assertFalse(enc.canEncode(UNENC_STRING)); assertEquals("%U2016".getBytes(CharsetNames.US_ASCII), enc.encode(UNENC_STRING)); assertFalse(enc.canEncode(BAD_STRING)); - assertEquals(BAD_STRING_ENC.getBytes(CharsetNames.US_ASCII), - enc.encode(BAD_STRING)); + byte[] expected = BAD_STRING_ENC.getBytes(CharsetNames.US_ASCII); + ByteBuffer actual = enc.encode(BAD_STRING); + assertEquals(expected, actual); } } From 9f76e86f71d36897895e1e5f49b522d12602f3cc Mon Sep 17 00:00:00 2001 From: Simon Spero Date: Thu, 15 Jun 2017 14:27:48 -0400 Subject: [PATCH 5/9] Redoing more buffer stuff Signed-off-by: Simon Spero (cherry picked from commit 330c8b3) Signed-off-by: Simon Spero --- .../commons/compress/internal/charset/HasCharset.java | 10 ++++++++++ 1 file changed, 10 insertions(+) create mode 100644 src/main/java/org/apache/commons/compress/internal/charset/HasCharset.java diff --git a/src/main/java/org/apache/commons/compress/internal/charset/HasCharset.java b/src/main/java/org/apache/commons/compress/internal/charset/HasCharset.java new file mode 100644 index 0000000000..040643205d --- /dev/null +++ b/src/main/java/org/apache/commons/compress/internal/charset/HasCharset.java @@ -0,0 +1,10 @@ +package org.apache.commons.compress.internal.charset; + +import java.nio.charset.Charset; + +/** + * Interface to allow access to a character set associated with an object + */ +public interface HasCharset { + Charset getCharset(); +} From b92044a5fe95a8b6901040ee8b4c698c0a650e7d Mon Sep 17 00:00:00 2001 From: Simon Spero Date: Sun, 18 Jun 2017 19:27:42 -0400 Subject: [PATCH 6/9] Test that ebcidic encoding is supported (making sure "%Uxxxx" replacement strings don't use ascii encodings) Signed-off-by: Simon Spero (cherry picked from commit f1ec715) Signed-off-by: Simon Spero --- .../compress/archivers/zip/ZipEncodingTest.java | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) diff --git a/src/test/java/org/apache/commons/compress/archivers/zip/ZipEncodingTest.java b/src/test/java/org/apache/commons/compress/archivers/zip/ZipEncodingTest.java index ce0934f24a..34a9cb8867 100644 --- a/src/test/java/org/apache/commons/compress/archivers/zip/ZipEncodingTest.java +++ b/src/test/java/org/apache/commons/compress/archivers/zip/ZipEncodingTest.java @@ -27,7 +27,6 @@ import java.io.IOException; import java.nio.ByteBuffer; import java.nio.charset.Charset; -import org.apache.commons.compress.utils.CharsetNames; import org.hamcrest.core.IsInstanceOf; import org.junit.Assert; import org.junit.Test; @@ -78,6 +77,13 @@ public void testSimpleCp850Encoding() throws IOException { } + @Test + public void testEbcidic() throws IOException { + + doSimpleEncodingTest("IBM1047", null); + } + + private void doSimpleEncodingsTest(int n) throws IOException { doSimpleEncodingTest("Cp" + n, null); @@ -178,11 +184,10 @@ private void doSimpleEncodingTest(final String name, byte[] testBytes) assertEquals(testBytes, encoded); assertFalse(enc.canEncode(UNENC_STRING)); - assertEquals("%U2016".getBytes(CharsetNames.US_ASCII), enc.encode(UNENC_STRING)); + assertEquals("%U2016".getBytes(name), enc.encode(UNENC_STRING)); assertFalse(enc.canEncode(BAD_STRING)); - byte[] expected = BAD_STRING_ENC.getBytes(CharsetNames.US_ASCII); - ByteBuffer actual = enc.encode(BAD_STRING); - assertEquals(expected, actual); + assertEquals(BAD_STRING_ENC.getBytes(name), enc.encode(BAD_STRING)); + assertEquals(BAD_STRING_ENC.getBytes(name), enc.encode(BAD_STRING)); } } From ae1e7b07f0e9483c07d33cf49862660cca3da81e Mon Sep 17 00:00:00 2001 From: Simon Spero Date: Sun, 18 Jun 2017 20:04:35 -0400 Subject: [PATCH 7/9] Add licence comment to HasCharset Signed-off-by: Simon Spero --- .../compress/archivers/zip/HasCharset.java | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/src/main/java/org/apache/commons/compress/archivers/zip/HasCharset.java b/src/main/java/org/apache/commons/compress/archivers/zip/HasCharset.java index 2e392a8aac..7581c18dac 100644 --- a/src/main/java/org/apache/commons/compress/archivers/zip/HasCharset.java +++ b/src/main/java/org/apache/commons/compress/archivers/zip/HasCharset.java @@ -1,3 +1,22 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + package org.apache.commons.compress.archivers.zip; import java.nio.charset.Charset; From 7e01241512f3f4059fa8d909bf7f98379ef4f373 Mon Sep 17 00:00:00 2001 From: Simon Spero Date: Sun, 18 Jun 2017 20:10:46 -0400 Subject: [PATCH 8/9] Resurrected HasCharset in the wrong place (not beyond the grave). Signed-off-by: Simon Spero --- .../commons/compress/internal/charset/HasCharset.java | 10 ---------- 1 file changed, 10 deletions(-) delete mode 100644 src/main/java/org/apache/commons/compress/internal/charset/HasCharset.java diff --git a/src/main/java/org/apache/commons/compress/internal/charset/HasCharset.java b/src/main/java/org/apache/commons/compress/internal/charset/HasCharset.java deleted file mode 100644 index 040643205d..0000000000 --- a/src/main/java/org/apache/commons/compress/internal/charset/HasCharset.java +++ /dev/null @@ -1,10 +0,0 @@ -package org.apache.commons.compress.internal.charset; - -import java.nio.charset.Charset; - -/** - * Interface to allow access to a character set associated with an object - */ -public interface HasCharset { - Charset getCharset(); -} From 922051d1f2250256697a1b042f1f319b2950fc45 Mon Sep 17 00:00:00 2001 From: Simon Spero Date: Mon, 19 Jun 2017 06:07:02 -0400 Subject: [PATCH 9/9] Remove methods and change test + throw to assert to please the coveralls Signed-off-by: Simon Spero --- .../archivers/zip/NioZipEncoding.java | 20 +++------ .../archivers/zip/ZipEncodingHelper.java | 41 ------------------- 2 files changed, 6 insertions(+), 55 deletions(-) diff --git a/src/main/java/org/apache/commons/compress/archivers/zip/NioZipEncoding.java b/src/main/java/org/apache/commons/compress/archivers/zip/NioZipEncoding.java index fed597f960..606ab12344 100644 --- a/src/main/java/org/apache/commons/compress/archivers/zip/NioZipEncoding.java +++ b/src/main/java/org/apache/commons/compress/archivers/zip/NioZipEncoding.java @@ -42,15 +42,10 @@ class NioZipEncoding implements ZipEncoding,HasCharset { private static final String REPLACEMENT_STRING = "?"; /** - * Construct an NIO based zip encoding, which wraps the given - * charset. - * - * @param charset The NIO charset to wrap. + * Construct an NioZipEncoding using the given charset. + * @param charset The character set to use. + * @param useReplacement should invalid characters be replaced, or reported. */ - NioZipEncoding(final Charset charset) { - this(charset, false); - } - NioZipEncoding(final Charset charset, boolean useReplacement) { this.charset = charset; this.useReplacement = useReplacement; @@ -148,9 +143,8 @@ public ByteBuffer encode(final String name) { } CoderResult coderResult = enc.encode(cb, out, true); - if (!coderResult.isUnderflow()) { - throw new RuntimeException("unexpected coder result: " + coderResult); - } + assert coderResult.isUnderflow() : "unexpected coder result: " + coderResult; + out.limit(out.position()); out.rewind(); @@ -163,9 +157,7 @@ private static ByteBuffer encodeFully(CharsetEncoder enc, CharBuffer cb, ByteBuf if (result.isOverflow()) { int increment = estimateIncrementalEncodingSize(enc, cb.remaining()); out = ZipEncodingHelper.growBufferBy(out, increment); - } else { - break; - } + } } return out; } diff --git a/src/main/java/org/apache/commons/compress/archivers/zip/ZipEncodingHelper.java b/src/main/java/org/apache/commons/compress/archivers/zip/ZipEncodingHelper.java index f31d75c8ee..fb550fd4f7 100644 --- a/src/main/java/org/apache/commons/compress/archivers/zip/ZipEncodingHelper.java +++ b/src/main/java/org/apache/commons/compress/archivers/zip/ZipEncodingHelper.java @@ -28,29 +28,6 @@ */ public abstract class ZipEncodingHelper { - /** - * Grow a byte buffer, so it has a minimal capacity or at least - * the double capacity of the original buffer - * - * @param b The original buffer. - * @param newCapacity The minimal requested new capacity. - * @return A byte buffer r with - * r.capacity() = max(b.capacity()*2,newCapacity) and - * all the data contained in b copied to the beginning - * of r. - * - */ - static ByteBuffer growBuffer(final ByteBuffer b, final int newCapacity) { - b.limit(b.position()); - b.rewind(); - - final int c2 = b.capacity() * 2; - final ByteBuffer on = ByteBuffer.allocate(c2 < newCapacity ? newCapacity : c2); - - on.put(b); - return on; - } - /** * The hexadecimal digits 0,...,9,A,...,F encoded as @@ -62,24 +39,6 @@ static ByteBuffer growBuffer(final ByteBuffer b, final int newCapacity) { 0x42, 0x43, 0x44, 0x45, 0x46 }; - /** - * Append %Uxxxx to the given byte buffer. - * The caller must assure, that bb.remaining()>=6. - * - * @param bb The byte buffer to write to. - * @param c The character to write. - */ - static void appendSurrogate(final ByteBuffer bb, final char c) { - - bb.put((byte) '%'); - bb.put((byte) 'U'); - - bb.put(HEX_DIGITS[(c >> 12)&0x0f]); - bb.put(HEX_DIGITS[(c >> 8)&0x0f]); - bb.put(HEX_DIGITS[(c >> 4)&0x0f]); - bb.put(HEX_DIGITS[c & 0x0f]); - } - /** * name of the encoding UTF-8