[CODEC-280] Added strict decoding property to BaseNCodec. (#35)

* [CODEC-280] Added strict decoding property to BaseNCodec. The default is lenient encoding. Any trailing bits that cannot be interpreted as part of the encoding are discarded. Combinations of trailing characters that cannot be created by a valid encoding are partially interpreted. If set to strict encoding the codec will raise an exception when trailing bits are present that are not part of a valid encoding. Added tests to show that in lenient mode re-encoding will create a different byte array. In strict mode the re-encoding should create the same byte array as the original (with appropriate pad characters on the input). * [CODEC-280] Added strict decoding property to BCodec. * [CODEC-280] Removed unused import @ignore from BCodecTest
apache · Jan 28, 2020 · 3c21223 · 3c21223
1 parent f5a61f0
commit 3c21223
Show file tree

Hide file tree

Showing 8 changed files with 220 additions and 35 deletions.
diff --git a/src/changes/changes.xml b/src/changes/changes.xml
@@ -44,6 +44,7 @@ The <action> type attribute can be add,update,fix,remove.
 
     <release version="1.15" date="YYYY-MM-DD" description="Feature and fix release.">
       <action issue="CODEC-264" dev="aherbert" due-to="Andy Seaborne" type="fix">MurmurHash3: Ensure hash128 maintains the sign extension bug.</action>
+      <action issue="CODEC-280" dev="aherbert" type="update">Base32/Base64/BCodec: Added strict decoding property to control handling of trailing bits. Default lenient mode discards them without error. Strict mode raise an exception.</action>
     </release>
 
     <release version="1.14" date="2019-12-30" description="Feature and fix release.">

diff --git a/src/main/java/org/apache/commons/codec/binary/Base32.java b/src/main/java/org/apache/commons/codec/binary/Base32.java
@@ -114,10 +114,6 @@ public class Base32 extends BaseNCodec {
             'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V',
     };
 
-    /** Mask used to extract 7 bits, used when decoding final trailing character. */
-    private static final long MASK_7BITS = 0x7fL;
-    /** Mask used to extract 6 bits, used when decoding final trailing character. */
-    private static final long MASK_6BITS = 0x3fL;
     /** Mask used to extract 5 bits, used when encoding Base32 bytes */
     private static final int MASK_5BITS = 0x1f;
     /** Mask used to extract 4 bits, used when decoding final trailing character. */
@@ -387,17 +383,26 @@ void decode(final byte[] input, int inPos, final int inAvail, final Context cont
         // Two forms of EOF as far as Base32 decoder is concerned: actual
         // EOF (-1) and first time '=' character is encountered in stream.
         // This approach makes the '=' padding characters completely optional.
-        if (context.eof && context.modulus >= 2) { // if modulus < 2, nothing to do
+        if (context.eof && context.modulus > 0) { // if modulus == 0, nothing to do
             final byte[] buffer = ensureBufferSize(decodeSize, context);
 
-            //  we ignore partial bytes, i.e. only multiples of 8 count
+            // We ignore partial bytes, i.e. only multiples of 8 count.
+            // Any combination not part of a valid encoding is either partially decoded
+            // or will raise an exception. Possible trailing characters are 2, 4, 5, 7.
+            // It is not possible to encode with 1, 3, 6 trailing characters.
+            // For backwards compatibility 3 & 6 chars are decoded anyway rather than discarded.
+            // See the encode(byte[]) method EOF section.
             switch (context.modulus) {
+//              case 0 : // impossible, as excluded above
+                case 1 : // 5 bits - either ignore entirely, or raise an exception
+                    validateTrailingCharacters();
                 case 2 : // 10 bits, drop 2 and output one byte
                     validateCharacter(MASK_2BITS, context);
                     buffer[context.pos++] = (byte) ((context.lbitWorkArea >> 2) & MASK_8BITS);
                     break;
-                case 3 : // 15 bits, drop 7 and output 1 byte
-                    validateCharacter(MASK_7BITS, context);
+                case 3 : // 15 bits, drop 7 and output 1 byte, or raise an exception
+                    validateTrailingCharacters();
+                    // Not possible from a valid encoding but decode anyway
                     buffer[context.pos++] = (byte) ((context.lbitWorkArea >> 7) & MASK_8BITS);
                     break;
                 case 4 : // 20 bits = 2*8 + 4
@@ -406,21 +411,22 @@ void decode(final byte[] input, int inPos, final int inAvail, final Context cont
                     buffer[context.pos++] = (byte) ((context.lbitWorkArea >> 8) & MASK_8BITS);
                     buffer[context.pos++] = (byte) ((context.lbitWorkArea) & MASK_8BITS);
                     break;
-                case 5 : // 25bits = 3*8 + 1
+                case 5 : // 25 bits = 3*8 + 1
                     validateCharacter(MASK_1BITS, context);
                     context.lbitWorkArea = context.lbitWorkArea >> 1;
                     buffer[context.pos++] = (byte) ((context.lbitWorkArea >> 16) & MASK_8BITS);
                     buffer[context.pos++] = (byte) ((context.lbitWorkArea >> 8) & MASK_8BITS);
                     buffer[context.pos++] = (byte) ((context.lbitWorkArea) & MASK_8BITS);
                     break;
-                case 6 : // 30bits = 3*8 + 6
-                    validateCharacter(MASK_6BITS, context);
+                case 6 : // 30 bits = 3*8 + 6, or raise an exception
+                    validateTrailingCharacters();
+                    // Not possible from a valid encoding but decode anyway
                     context.lbitWorkArea = context.lbitWorkArea >> 6;
                     buffer[context.pos++] = (byte) ((context.lbitWorkArea >> 16) & MASK_8BITS);
                     buffer[context.pos++] = (byte) ((context.lbitWorkArea >> 8) & MASK_8BITS);
                     buffer[context.pos++] = (byte) ((context.lbitWorkArea) & MASK_8BITS);
                     break;
-                case 7 : // 35 = 4*8 +3
+                case 7 : // 35 bits = 4*8 +3
                     validateCharacter(MASK_3BITS, context);
                     context.lbitWorkArea = context.lbitWorkArea >> 3;
                     buffer[context.pos++] = (byte) ((context.lbitWorkArea >> 24) & MASK_8BITS);
@@ -559,6 +565,20 @@ public boolean isInAlphabet(final byte octet) {
         return octet >= 0 && octet < decodeTable.length && decodeTable[octet] != -1;
     }
 
+    /**
+     * Validates whether decoding allows final trailing characters that cannot be
+     * created during encoding.
+     *
+     * @throws IllegalArgumentException if strict decoding is enabled
+     */
+    private void validateTrailingCharacters() {
+        if (isStrictDecoding()) {
+            throw new IllegalArgumentException(
+                "Strict decoding: Last encoded character(s) (before the paddings if any) are valid base 32 alphabet but not a possible encoding. " +
+                "Decoding requries either 2, 4, 5, or 7 trailing 5-bit characters to create bytes.");
+        }
+    }
+
     /**
      * Validates whether decoding the final trailing character is possible in the context
      * of the set of possible base 32 values.
@@ -571,12 +591,12 @@ public boolean isInAlphabet(final byte octet) {
      *
      * @throws IllegalArgumentException if the bits being checked contain any non-zero value
      */
-    private static void validateCharacter(final long emptyBitsMask, final Context context) {
+    private void validateCharacter(final long emptyBitsMask, final Context context) {
         // Use the long bit work area
-        if ((context.lbitWorkArea & emptyBitsMask) != 0) {
+        if (isStrictDecoding() && (context.lbitWorkArea & emptyBitsMask) != 0) {
             throw new IllegalArgumentException(
-                "Last encoded character (before the paddings if any) is a valid base 32 alphabet but not a possible value. " +
-                "Expected the discarded bits to be zero.");
+                "Strict decoding: Last encoded character (before the paddings if any) is a valid base 32 alphabet but not a possible encoding. " +
+                "Expected the discarded bits from the character to be zero.");
         }
     }
 }
diff --git a/src/main/java/org/apache/commons/codec/binary/Base64.java b/src/main/java/org/apache/commons/codec/binary/Base64.java
@@ -470,8 +470,8 @@ void decode(final byte[] in, int inPos, final int inAvail, final Context context
             // Output all whole multiples of 8 bits and ignore the rest
             switch (context.modulus) {
 //              case 0 : // impossible, as excluded above
-                case 1 : // 6 bits - ignore entirely
-                    // TODO not currently tested; perhaps it is impossible?
+                case 1 : // 6 bits - either ignore entirely, or raise an exception
+                    validateTrailingCharacter();
                     break;
                 case 2 : // 12 bits = 8 + 4
                     validateCharacter(MASK_4BITS, context);
@@ -786,6 +786,20 @@ protected boolean isInAlphabet(final byte octet) {
         return octet >= 0 && octet < decodeTable.length && decodeTable[octet] != -1;
     }
 
+    /**
+     * Validates whether decoding allows an entire final trailing character that cannot be
+     * used for a complete byte.
+     *
+     * @throws IllegalArgumentException if strict decoding is enabled
+     */
+    private void validateTrailingCharacter() {
+        if (isStrictDecoding()) {
+            throw new IllegalArgumentException(
+                "Strict decoding: Last encoded character (before the paddings if any) is a valid base 64 alphabet but not a possible encoding. " +
+                "Decoding requries at least two trailing 6-bit characters to create bytes.");
+        }
+    }
+
     /**
      * Validates whether decoding the final trailing character is possible in the context
      * of the set of possible base 64 values.
@@ -798,11 +812,11 @@ protected boolean isInAlphabet(final byte octet) {
      *
      * @throws IllegalArgumentException if the bits being checked contain any non-zero value
      */
-    private static void validateCharacter(final int emptyBitsMask, final Context context) {
-        if ((context.ibitWorkArea & emptyBitsMask) != 0) {
+    private void validateCharacter(final int emptyBitsMask, final Context context) {
+        if (isStrictDecoding() && (context.ibitWorkArea & emptyBitsMask) != 0) {
             throw new IllegalArgumentException(
-                "Last encoded character (before the paddings if any) is a valid base 64 alphabet but not a possible value. " +
-                "Expected the discarded bits to be zero.");
+                "Strict decoding: Last encoded character (before the paddings if any) is a valid base 64 alphabet but not a possible encoding. " +
+                "Expected the discarded bits from the character to be zero.");
         }
     }
 }
diff --git a/src/main/java/org/apache/commons/codec/binary/BaseNCodec.java b/src/main/java/org/apache/commons/codec/binary/BaseNCodec.java
@@ -23,6 +23,7 @@
 import org.apache.commons.codec.BinaryEncoder;
 import org.apache.commons.codec.DecoderException;
 import org.apache.commons.codec.EncoderException;
+import org.apache.commons.codec.binary.BaseNCodec.Context;
 
 /**
  * Abstract superclass for Base-N encoders and decoders.
@@ -190,6 +191,12 @@ public String toString() {
      */
     private final int chunkSeparatorLength;
 
+    /**
+     * If true then decoding should throw an exception for impossible combinations of bits at the
+     * end of the byte input. The default is to decode as much of them as possible.
+     */
+    private boolean strictDecoding;
+
     /**
      * Note {@code lineLength} is rounded down to the nearest multiple of the encoded block size.
      * If {@code chunkSeparatorLength} is zero, then chunking is disabled.
@@ -223,6 +230,47 @@ protected BaseNCodec(final int unencodedBlockSize, final int encodedBlockSize,
         this.pad = pad;
     }
 
+    /**
+     * Sets the decoding behaviour when the input bytes contain leftover trailing bits that
+     * cannot be created by a valid encoding. These can be bits that are unused from the final
+     * character or entire characters. The default mode is lenient decoding. Set this to
+     * {@code true} to enable strict decoding.
+     * <ul>
+     * <li>Lenient: Any trailing bits are composed into 8-bit bytes where possible.
+     *     The remainder are discarded.
+     * <li>Strict: The decoding will raise an {@link IllegalArgumentException} if trailing bits
+     *     are not part of a valid encoding. Any unused bits from the final character must
+     *     be zero. Impossible counts of entire final characters are not allowed.
+     * </ul>
+     *
+     * <p>When strict decoding is enabled it is expected that the decoded bytes will be re-encoded
+     * to a byte array that matches the original, i.e. no changes occur on the final
+     * character. This requires that the input bytes use the same padding and alphabet
+     * as the encoder.
+     *
+     * @param strictDecoding Set to true to enable strict decoding; otherwise use lenient decoding.
+     * @see #encode(byte[])
+     * @since 1.15
+     */
+    public void setStrictDecoding(boolean strictDecoding) {
+        this.strictDecoding = strictDecoding;
+    }
+
+    /**
+     * Returns true if decoding behaviour is strict. Decoding will raise an
+     * {@link IllegalArgumentException} if trailing bits are not part of a valid encoding.
+     *
+     * <p>The default is false for lenient encoding. Decoding will compose trailing bits
+     * into 8-bit bytes and discard the remainder.
+     *
+     * @return true if using strict decoding
+     * @see #setStrictDecoding(boolean)
+     * @since 1.15
+     */
+    public boolean isStrictDecoding() {
+        return strictDecoding;
+    }
+
     /**
      * Returns true if this object has buffered data for reading.
      *

diff --git a/src/main/java/org/apache/commons/codec/net/BCodec.java b/src/main/java/org/apache/commons/codec/net/BCodec.java
@@ -48,6 +48,12 @@ public class BCodec extends RFC1522Codec implements StringEncoder, StringDecoder
      */
     private final Charset charset;
 
+    /**
+     * If true then decoding should throw an exception for impossible combinations of bits at the
+     * end of the byte input. The default is to decode as much of them as possible.
+     */
+    private boolean strictDecoding;
+
     /**
      * Default constructor.
      */
@@ -82,6 +88,40 @@ public BCodec(final String charsetName) {
         this(Charset.forName(charsetName));
     }
 
+    /**
+     * Sets the decoding behaviour when the input bytes contain leftover trailing bits that
+     * cannot be created by a valid Base64 encoding. This setting is transferred to the instance
+     * of {@link Base64} used to perform decoding.
+     *
+     * <p>The default is false for lenient encoding. Decoding will compose trailing bits
+     * into 8-bit bytes and discard the remainder.
+     *
+     * <p>Set to true to enable strict decoding. Decoding will raise a
+     * {@link DecoderException} if trailing bits are not part of a valid Base64 encoding.
+     *
+     * @param strictDecoding Set to true to enable strict decoding; otherwise use lenient decoding.
+     * @see org.apache.commons.codec.binary.BaseNCodec#setStrictDecoding(boolean) BaseNCodec.setStrictDecoding(boolean)
+     * @since 1.15
+     */
+    public void setStrictDecoding(boolean strictDecoding) {
+        this.strictDecoding = strictDecoding;
+    }
+
+    /**
+     * Returns true if decoding behaviour is strict. Decoding will raise a
+     * {@link DecoderException} if trailing bits are not part of a valid Base64 encoding.
+     *
+     * <p>The default is false for lenient encoding. Decoding will compose trailing bits
+     * into 8-bit bytes and discard the remainder.
+     *
+     * @return true if using strict decoding
+     * @see #setStrictDecoding(boolean)
+     * @since 1.15
+     */
+    public boolean isStrictDecoding() {
+        return strictDecoding;
+    }
+
     @Override
     protected String getEncoding() {
         return "B";
@@ -100,7 +140,9 @@ protected byte[] doDecoding(final byte[] bytes) {
         if (bytes == null) {
             return null;
         }
-        return Base64.decodeBase64(bytes);
+        final Base64 codec = new Base64();
+        codec.setStrictDecoding(strictDecoding);
+        return codec.decode(bytes);
     }
 
     /**

diff --git a/src/test/java/org/apache/commons/codec/binary/Base32Test.java b/src/test/java/org/apache/commons/codec/binary/Base32Test.java
@@ -19,8 +19,10 @@
 package org.apache.commons.codec.binary;
 
 import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertFalse;
 import static org.junit.Assert.assertArrayEquals;
 import static org.junit.Assert.assertNotNull;
+import static org.junit.Assert.assertTrue;
 import static org.junit.Assert.fail;
 
 import java.nio.charset.Charset;
@@ -299,6 +301,7 @@ public void testBase32HexImpossibleSamples() {
     }
 
     private void testImpossibleCases(final Base32 codec, final String[] impossible_cases) {
+        codec.setStrictDecoding(true);
         for (final String impossible : impossible_cases) {
             try {
                 codec.decode(impossible);
@@ -309,6 +312,11 @@ private void testImpossibleCases(final Base32 codec, final String[] impossible_c
         }
     }
 
+    @Test
+    public void testBase32DecodingOfTrailing5Bits() {
+        assertBase32DecodingOfTrailingBits(5);
+    }
+
     @Test
     public void testBase32DecodingOfTrailing10Bits() {
         assertBase32DecodingOfTrailingBits(10);
@@ -349,31 +357,49 @@ public void testBase32DecodingOfTrailing35Bits() {
      */
     private static void assertBase32DecodingOfTrailingBits(final int nbits) {
         final Base32 codec = new Base32();
-        // Create the encoded bytes. The first characters must be valid so fill with 'zero'.
-        final byte[] encoded = new byte[nbits / 5];
-        Arrays.fill(encoded, ENCODE_TABLE[0]);
+        // Requires strict decoding
+        codec.setStrictDecoding(true);
+        assertTrue(codec.isStrictDecoding());
+        // A lenient decoder should not re-encode to the same bytes
+        final Base32 defaultCodec = new Base32();
+        assertFalse(defaultCodec.isStrictDecoding());
+
+        // Create the encoded bytes. The first characters must be valid so fill with 'zero'
+        // then pad to the block size.
+        final int length = nbits / 5;
+        final byte[] encoded = new byte[8];
+        Arrays.fill(encoded, 0, length, ENCODE_TABLE[0]);
+        Arrays.fill(encoded, length, encoded.length, (byte) '=');
         // Compute how many bits would be discarded from 8-bit bytes
         final int discard = nbits % 8;
         final int emptyBitsMask = (1 << discard) - 1;
+        // Special case when an impossible number of trailing characters
+        final boolean invalid = length == 1 || length == 3 || length == 6;
         // Enumerate all 32 possible final characters in the last position
-        final int last = encoded.length - 1;
+        final int last = length - 1;
         for (int i = 0; i < 32; i++) {
             encoded[last] = ENCODE_TABLE[i];
             // If the lower bits are set we expect an exception. This is not a valid
             // final character.
-            if ((i & emptyBitsMask) != 0) {
+            if (invalid || (i & emptyBitsMask) != 0) {
                 try {
                     codec.decode(encoded);
                     fail("Final base-32 digit should not be allowed");
                 } catch (final IllegalArgumentException ex) {
                     // expected
                 }
+                // The default lenient mode should decode this
+                final byte[] decoded = defaultCodec.decode(encoded);
+                // Re-encoding should not match the original array as it was invalid
+                assertFalse(Arrays.equals(encoded, defaultCodec.encode(decoded)));
             } else {
                 // Otherwise this should decode
                 final byte[] decoded = codec.decode(encoded);
                 // Compute the bits that were encoded. This should match the final decoded byte.
                 final int bitsEncoded = i >> discard;
                 assertEquals("Invalid decoding of last character", bitsEncoded, decoded[decoded.length - 1]);
+                // Re-encoding should match the original array (requires the same padding character)
+                assertArrayEquals(encoded, codec.encode(decoded));
             }
         }
     }