From 161fc252ac1106a934378061002e2f47064c3bab Mon Sep 17 00:00:00 2001 From: jean-claude cote Date: Sat, 14 May 2016 18:02:57 -0400 Subject: [PATCH 1/2] DRILL-4573 Fixed issue with regexp_replace function --- .../expr/fn/impl/CharSequenceWrapper.java | 57 ++++++++++++++++++- .../exec/expr/fn/impl/StringFunctions.java | 28 +++++++-- .../expr/fn/impl/TestStringFunctions.java | 13 +++++ 3 files changed, 90 insertions(+), 8 deletions(-) diff --git a/exec/java-exec/src/main/java/org/apache/drill/exec/expr/fn/impl/CharSequenceWrapper.java b/exec/java-exec/src/main/java/org/apache/drill/exec/expr/fn/impl/CharSequenceWrapper.java index 6c475ed503c..fb6dc3cc81d 100644 --- a/exec/java-exec/src/main/java/org/apache/drill/exec/expr/fn/impl/CharSequenceWrapper.java +++ b/exec/java-exec/src/main/java/org/apache/drill/exec/expr/fn/impl/CharSequenceWrapper.java @@ -17,13 +17,30 @@ */ package org.apache.drill.exec.expr.fn.impl; +import java.util.regex.Matcher; + import io.netty.buffer.DrillBuf; +/** + * A CharSequence is a readable sequence of char values. This interface provides + * uniform, read-only access to many different kinds of char sequences. A char + * value represents a character in the Basic Multilingual Plane (BMP) or a + * surrogate. Refer to Unicode Character Representation for details.
+ * Specifically this implementation of the CharSequence adapts a Drill + * {@link DrillBuf} to the CharSequence. The implementation is meant to be + * re-used that is allocated once and then passed DrillBuf to adapt. This can be + * handy to exploit API that consume CharSequence avoiding the need to create + * string objects. + * + */ public class CharSequenceWrapper implements CharSequence { + // The adapted drill buffer + private DrillBuf buffer; + // The start offset into the drill buffer private int start; + // The end offset into the drill buffer private int end; - private DrillBuf buffer; @Override public int length() { @@ -32,18 +49,54 @@ public int length() { @Override public char charAt(int index) { - return (char) buffer.getByte(start + index); + int charPos = start + index; + if(charPos < start || charPos >= end) + { + throw new IndexOutOfBoundsException(); + } + // Get the char within the DrillBuf. + return (char) buffer.getByte(charPos); } + /** + * When using the Java regex {@link Matcher} the subSequence is only called + * when capturing groups. Drill does not currently use capture groups in the + * UDF so this method is not required.
+ * It could be implemented by creating a new CharSequenceWrapper however + * this would imply newly allocated objects which is what this wrapper tries + * to avoid. + * + */ @Override public CharSequence subSequence(int start, int end) { throw new UnsupportedOperationException("Not implemented."); } + /** + * Set the DrillBuf to adapt to a CharSequence. This method can be used to + * replace any previous DrillBuf thus avoiding recreating the + * CharSequenceWrapper and thus re-using the CharSequenceWrapper object. + * + * @param start + * @param end + * @param buffer + */ public void setBuffer(int start, int end, DrillBuf buffer) { this.start = start; this.end = end; this.buffer = buffer; } + /** + * In cases where a method like {@link Matcher#replaceAll(String)} is used this + * {@link CharSequence#toString()} method will be called, so we need to return a + * string representing the value of this CharSequence. Note however that the + * regexp_replace function is implemented in a way to avoid the call to toString() + * not to uselessly create a string object. + */ + @Override + public String toString() { + return StringFunctionHelpers.toStringFromUTF8(start, end, buffer); + } + } diff --git a/exec/java-exec/src/main/java/org/apache/drill/exec/expr/fn/impl/StringFunctions.java b/exec/java-exec/src/main/java/org/apache/drill/exec/expr/fn/impl/StringFunctions.java index 0ce1c4e2d72..6846fdea81d 100644 --- a/exec/java-exec/src/main/java/org/apache/drill/exec/expr/fn/impl/StringFunctions.java +++ b/exec/java-exec/src/main/java/org/apache/drill/exec/expr/fn/impl/StringFunctions.java @@ -240,14 +240,30 @@ public void setup() { public void eval() { out.start = 0; charSequenceWrapper.setBuffer(input.start, input.end, input.buffer); + final String r = org.apache.drill.exec.expr.fn.impl.StringFunctionHelpers.toStringFromUTF8(replacement.start, replacement.end, replacement.buffer); // Reusing same charSequenceWrapper, no need to pass it in. - // This saves one method call since reset(CharSequence) calls reset() matcher.reset(); - final String r = org.apache.drill.exec.expr.fn.impl.StringFunctionHelpers.toStringFromUTF8(replacement.start, replacement.end, replacement.buffer); - final byte [] bytea = matcher.replaceAll(r).getBytes(java.nio.charset.Charset.forName("UTF-8")); - out.buffer = buffer = buffer.reallocIfNeeded(bytea.length); - out.buffer.setBytes(out.start, bytea); - out.end = bytea.length; + // Implementation of Matcher.replaceAll() in-lined to avoid creating String object + // in cases where we don't actually replace anything. + boolean result = matcher.find(); + if (result) { + StringBuffer sb = new StringBuffer(); + do { + matcher.appendReplacement(sb, r); + result = matcher.find(); + } while (result); + matcher.appendTail(sb); + final byte [] bytea = sb.toString().getBytes(java.nio.charset.Charset.forName("UTF-8")); + out.buffer = buffer = buffer.reallocIfNeeded(bytea.length); + out.buffer.setBytes(out.start, bytea); + out.end = bytea.length; + } + else { + // There is no matches, copy the input bytes into the output buffer + out.buffer = buffer = buffer.reallocIfNeeded(input.end - input.start); + out.buffer.setBytes(0, input.buffer, input.start, input.end - input.start); + out.end = input.end - input.start; + } } } diff --git a/exec/java-exec/src/test/java/org/apache/drill/exec/expr/fn/impl/TestStringFunctions.java b/exec/java-exec/src/test/java/org/apache/drill/exec/expr/fn/impl/TestStringFunctions.java index 2efab3bbdc4..fe5daf96ecb 100644 --- a/exec/java-exec/src/test/java/org/apache/drill/exec/expr/fn/impl/TestStringFunctions.java +++ b/exec/java-exec/src/test/java/org/apache/drill/exec/expr/fn/impl/TestStringFunctions.java @@ -113,6 +113,19 @@ public void testRegexpMatches() throws Exception { .run(); } + @Test + public void testRegexpReplace() throws Exception { + testBuilder() + .sqlQuery("select regexp_replace(a, 'a|c', 'x') res1, regexp_replace(b, 'd', 'zzz') res2 " + + "from (values('abc', 'bcd'), ('bcd', 'abc')) as t(a,b)") + .unOrdered() + .baselineColumns("res1", "res2") + .baselineValues("xbx", "bczzz") + .baselineValues("bxd", "abc") + .build() + .run(); + } + @Test public void testILike() throws Exception { testBuilder() From 84846241bf8beef2a090414aacda2991911022d7 Mon Sep 17 00:00:00 2001 From: jean-claude cote Date: Wed, 25 May 2016 22:11:58 -0400 Subject: [PATCH 2/2] DRILL-4573 test if bytes are all ascii and apply strategy accordingly --- .../expr/fn/impl/CharSequenceWrapper.java | 151 ++++++++++++++++-- .../exec/expr/fn/impl/StringFunctions.java | 8 +- 2 files changed, 142 insertions(+), 17 deletions(-) diff --git a/exec/java-exec/src/main/java/org/apache/drill/exec/expr/fn/impl/CharSequenceWrapper.java b/exec/java-exec/src/main/java/org/apache/drill/exec/expr/fn/impl/CharSequenceWrapper.java index fb6dc3cc81d..0b5e5bbc0f4 100644 --- a/exec/java-exec/src/main/java/org/apache/drill/exec/expr/fn/impl/CharSequenceWrapper.java +++ b/exec/java-exec/src/main/java/org/apache/drill/exec/expr/fn/impl/CharSequenceWrapper.java @@ -17,6 +17,12 @@ */ package org.apache.drill.exec.expr.fn.impl; +import java.nio.ByteBuffer; +import java.nio.CharBuffer; +import java.nio.charset.CharacterCodingException; +import java.nio.charset.Charset; +import java.nio.charset.CharsetDecoder; +import java.nio.charset.CoderResult; import java.util.regex.Matcher; import io.netty.buffer.DrillBuf; @@ -35,12 +41,28 @@ */ public class CharSequenceWrapper implements CharSequence { - // The adapted drill buffer + // The adapted drill buffer (in the case of US-ASCII) private DrillBuf buffer; + // The converted bytes in the case of non ASCII + private CharBuffer charBuffer; + // initial char buffer capacity + private static final int INITIAL_CHAR_BUF = 1024; + // The decoder to use in the case of non ASCII + private CharsetDecoder decoder; + // The start offset into the drill buffer private int start; // The end offset into the drill buffer private int end; + // Indicates that the current byte buffer contains only ascii chars + private boolean usAscii; + + public CharSequenceWrapper(){ + } + + public CharSequenceWrapper(int start, int end, DrillBuf buffer){ + setBuffer(start, end, buffer); + } @Override public int length() { @@ -49,13 +71,14 @@ public int length() { @Override public char charAt(int index) { - int charPos = start + index; - if(charPos < start || charPos >= end) - { - throw new IndexOutOfBoundsException(); + if(usAscii){ + // Each byte is a char, the index is relative to the start of the original buffer + return (char) (buffer.getByte(start + index) & 0x00FF); + } + else{ + // The char buffer is a copy so the index directly corresponds + return charBuffer.charAt(index); } - // Get the char within the DrillBuf. - return (char) buffer.getByte(charPos); } /** @@ -69,7 +92,7 @@ public char charAt(int index) { */ @Override public CharSequence subSequence(int start, int end) { - throw new UnsupportedOperationException("Not implemented."); + throw new UnsupportedOperationException(); } /** @@ -82,21 +105,117 @@ public CharSequence subSequence(int start, int end) { * @param buffer */ public void setBuffer(int start, int end, DrillBuf buffer) { - this.start = start; - this.end = end; - this.buffer = buffer; + // Test if buffer is an ASCII string or not. + usAscii = isAscii(start, end, buffer); + + if(usAscii){ + // each byte equals one char + this.start = start; + this.end = end; + this.buffer = buffer; + } + else { + initCharBuffer(); + // Wrap with java byte buffer + ByteBuffer byteBuf = buffer.nioBuffer(start, end-start); + while(charBuffer.capacity() < Integer.MAX_VALUE){ + byteBuf.mark(); + if(decodeUT8(byteBuf)){ + break; + } + // Failed to convert because the char buffer was not large enough + growCharBuffer(); + // Make sure to reset the byte buffer we need to reprocess it + byteBuf.reset(); + } + this.start = 0; + this.end = charBuffer.position(); + // reset the char buffer so the index are relative to the start of the buffer + charBuffer.rewind(); + } + } + + /** + * Test if the buffer contains only ASCII bytes. + * @param start + * @param end + * @param buffer + * @return + */ + private boolean isAscii(int start, int end, DrillBuf buffer) { + for (int i = start; i <= end; i++) { + byte bb = buffer.getByte(i); + if (bb < 0) { + //System.out.printf("Not a ASCII byte 0x%02X\n", bb); + return false; + } + } + return true; + } + + /** + * Initialize the charbuffer and decoder if they are not yet initialized. + */ + private void initCharBuffer() { + if(charBuffer == null){ + charBuffer = CharBuffer.allocate(INITIAL_CHAR_BUF); + } + if(decoder == null){ + decoder = Charset.forName("UTF-8").newDecoder(); + } + } + + /** + * Decode the buffer using the CharsetDecoder. + * @param byteBuf + * @return false if failed because the charbuffer was not big enough + * @throws RuntimeExcepction if it fails for encoding errors + */ + private boolean decodeUT8(ByteBuffer byteBuf) { + // We give it all of the input data in call. + boolean endOfInput = true; + decoder.reset(); + charBuffer.rewind(); + // Convert utf-8 bytes to sequence of chars + CoderResult result = decoder.decode(byteBuf, charBuffer, endOfInput); + if(result.isOverflow()){ + // Not enough space in the charBuffer. + return false; + } else if(result.isError()) { + // Any other error + try { + result.throwException(); + } catch (CharacterCodingException e) { + throw new RuntimeException(e); + } + } + return true; + } + + /** + * Grow the charbuffer making sure not to overflow size integer. Note + * this grows in the same manner as the ArrayList that is it adds 50% + * to the current size. + */ + private void growCharBuffer() { + // overflow-conscious code + int oldCapacity = charBuffer.capacity(); + //System.out.println("old capacity " + oldCapacity); + int newCapacity = oldCapacity + (oldCapacity >> 1); + if (newCapacity < 0){ + newCapacity = Integer.MAX_VALUE; + } + //System.out.println("new capacity " + newCapacity); + charBuffer = CharBuffer.allocate(newCapacity); } /** - * In cases where a method like {@link Matcher#replaceAll(String)} is used this - * {@link CharSequence#toString()} method will be called, so we need to return a - * string representing the value of this CharSequence. Note however that the - * regexp_replace function is implemented in a way to avoid the call to toString() + * The regexp_replace function is implemented in a way to avoid the call to toString() * not to uselessly create a string object. */ @Override public String toString() { - return StringFunctionHelpers.toStringFromUTF8(start, end, buffer); + throw new UnsupportedOperationException(); } } diff --git a/exec/java-exec/src/main/java/org/apache/drill/exec/expr/fn/impl/StringFunctions.java b/exec/java-exec/src/main/java/org/apache/drill/exec/expr/fn/impl/StringFunctions.java index 6846fdea81d..8f562b71687 100644 --- a/exec/java-exec/src/main/java/org/apache/drill/exec/expr/fn/impl/StringFunctions.java +++ b/exec/java-exec/src/main/java/org/apache/drill/exec/expr/fn/impl/StringFunctions.java @@ -284,7 +284,13 @@ public static class RegexpMatches implements DrillSimpleFunc { @Override public void setup() { - matcher = java.util.regex.Pattern.compile(org.apache.drill.exec.expr.fn.impl.StringFunctionHelpers.toStringFromUTF8(pattern.start, pattern.end, pattern.buffer)).matcher(""); + CharSequenceWrapper patternWrapper = new CharSequenceWrapper(pattern.start, pattern.end, pattern.buffer); + char[] chars = new char[patternWrapper.length()]; + for(int i=0; i