Skip to content

Commit

Permalink
[SPARK-14878][SQL] Trim characters string function support
Browse files Browse the repository at this point in the history
#### What changes were proposed in this pull request?

This PR enhances the TRIM function support in Spark SQL by allowing a set of trim
characters to be specified. The SQL syntax is shown below:

``` SQL
<trim function> ::= TRIM <left paren> <trim operands> <right paren>
<trim operands> ::= [ [ <trim specification> ] [ <trim character set> ] FROM ] <trim source>
<trim source> ::= <character value expression>
<trim specification> ::=
  LEADING
| TRAILING
| BOTH
<trim character set> ::= <characters value expression>
```
or
``` SQL
LTRIM (source-exp [, trim-exp])
RTRIM (source-exp [, trim-exp])
```

Here are links to the documentation for this feature in other mainstream databases.
- **Oracle:** [TRIM function](http://docs.oracle.com/cd/B28359_01/olap.111/b28126/dml_functions_2126.htm#OLADM704)
- **DB2:** [TRIM scalar function](https://www.ibm.com/support/knowledgecenter/en/SSMKHH_10.0.0/com.ibm.etools.mft.doc/ak05270_.htm)
- **MySQL:** [Trim function](http://dev.mysql.com/doc/refman/5.7/en/string-functions.html#function_trim)
- **Oracle:** [ltrim](https://docs.oracle.com/cd/B28359_01/olap.111/b28126/dml_functions_2018.htm#OLADM594)
- **DB2:** [ltrim](https://www.ibm.com/support/knowledgecenter/en/SSEPEK_11.0.0/sqlref/src/tpc/db2z_bif_ltrim.html)

This PR implements the above enhancement. In the implementation, the design principle is to keep the changes to a minimum. Also, the existing trim functions (which handle a special case, i.e., trimming space characters) are kept unchanged for performance reasons.
#### How was this patch tested?

The unit test cases are added in the following files:
- UTF8StringSuite.java
- StringExpressionsSuite.scala
- sql/SQLQuerySuite.scala
- StringFunctionsSuite.scala

Author: Kevin Yu <qyu@us.ibm.com>

Closes #12646 from kevinyu98/spark-14878.
  • Loading branch information
kevinyu98 authored and gatorsmile committed Sep 18, 2017
1 parent 3b049ab commit c66d64b
Show file tree
Hide file tree
Showing 10 changed files with 554 additions and 32 deletions.
Expand Up @@ -511,6 +511,21 @@ public UTF8String trim() {
}
}

/**
 * Trims this string on both ends using the characters found in {@code trimString}.
 * The left end is processed first through {@link #trimLeft(UTF8String)}; the right end of
 * that intermediate result is then processed through {@link #trimRight(UTF8String)}.
 *
 * @param trimString the string whose characters are removed from both ends of this string
 * @return the string with both ends trimmed, or {@code null} when {@code trimString} is
 *         {@code null}
 */
public UTF8String trim(UTF8String trimString) {
  if (trimString == null) {
    return null;
  }
  UTF8String leftTrimmed = trimLeft(trimString);
  return leftTrimmed.trimRight(trimString);
}

public UTF8String trimLeft() {
int s = 0;
// skip all of the space (0x20) in the left side
Expand All @@ -523,6 +538,40 @@ public UTF8String trimLeft() {
}
}

/**
 * Trims this string from the left end using the characters found in {@code trimString}.
 * Starting at byte 0, the string is walked one character at a time (the byte width of each
 * character is taken from its first byte). A character is dropped when it can be found in
 * {@code trimString}; the walk stops at the first character that cannot be found.
 *
 * @param trimString the string whose characters are removed from the left end
 * @return {@code null} when {@code trimString} is {@code null}; {@code EMPTY_UTF8} when every
 *         character was dropped (or this string has no bytes); otherwise the bytes from the
 *         first non-matching character through the end of this string
 */
public UTF8String trimLeft(UTF8String trimString) {
  if (trimString == null) {
    return null;
  }

  // Byte offset of the character currently being examined; once the loop ends it is the
  // offset of the first byte that must be kept.
  int cursor = 0;
  while (cursor < numBytes) {
    int lastByteOfChar = cursor + numBytesForFirstByte(this.getByte(cursor)) - 1;
    UTF8String candidate = copyUTF8String(cursor, lastByteOfChar);
    if (trimString.find(candidate, 0) < 0) {
      // the character is not part of the trim set, so trimming ends here
      break;
    }
    cursor += candidate.numBytes;
  }

  // everything was consumed: nothing is left to return
  return cursor >= numBytes ? EMPTY_UTF8 : copyUTF8String(cursor, numBytes - 1);
}

public UTF8String trimRight() {
int e = numBytes - 1;
// skip all of the space (0x20) in the right side
Expand All @@ -536,6 +585,50 @@ public UTF8String trimRight() {
}
}

/**
 * Trims this string from the right end using the characters found in {@code trimString}.
 * A forward pass first records where each character starts and how many bytes it occupies,
 * because character boundaries are derived from each character's first byte. The recorded
 * characters are then visited from last to first: a character is dropped when it can be found
 * in {@code trimString}, and the walk stops at the first character that cannot be found.
 *
 * @param trimString the string whose characters are removed from the right end
 * @return {@code null} when {@code trimString} is {@code null}; {@code EMPTY_UTF8} when every
 *         character was dropped (or this string has no bytes); otherwise the bytes from the
 *         start of this string through the last non-matching character
 */
public UTF8String trimRight(UTF8String trimString) {
  if (trimString == null) {
    return null;
  }

  // Sized by byte count, which is an upper bound on the number of characters.
  int[] charStart = new int[numBytes];
  int[] charWidth = new int[numBytes];
  int charCount = 0;

  // forward pass: record the first byte position and byte width of every character
  int bytePos = 0;
  while (bytePos < numBytes) {
    charStart[charCount] = bytePos;
    charWidth[charCount] = numBytesForFirstByte(getByte(bytePos));
    bytePos += charWidth[charCount];
    charCount++;
  }

  // lastKept is the index of the final byte that survives trimming
  int lastKept = numBytes - 1;
  for (int i = charCount - 1; i >= 0; i--) {
    int first = charStart[i];
    int width = charWidth[i];
    UTF8String candidate = copyUTF8String(first, first + width - 1);
    if (trimString.find(candidate, 0) < 0) {
      // the character is not part of the trim set, so trimming ends here
      break;
    }
    lastKept -= width;
  }

  // everything was consumed: nothing is left to return
  return lastKept < 0 ? EMPTY_UTF8 : copyUTF8String(0, lastKept);
}

public UTF8String reverse() {
byte[] result = new byte[this.numBytes];

Expand Down
Expand Up @@ -730,4 +730,61 @@ public void testToLong() throws IOException {
assertFalse(negativeInput, UTF8String.fromString(negativeInput).toLong(wrapper));
}
}

/**
 * Exercises {@code trim(UTF8String)} with single-byte and multi-byte sources and trim sets.
 */
@Test
public void trimBothWithTrimString() {
  // single-byte (ASCII) sources
  assertEquals(fromString("hello"), fromString(" hello ").trim(fromString(" ")));
  assertEquals(fromString("o"), fromString(" hello ").trim(fromString(" hle")));
  assertEquals(fromString("h e"), fromString("ooh e ooo").trim(fromString("o ")));
  assertEquals(fromString(""), fromString("ooo...oooo").trim(fromString("o.")));
  assertEquals(fromString("b"), fromString("%^b[]@").trim(fromString("][@^%")));

  // a source made up entirely of trim characters collapses to the empty string
  assertEquals(EMPTY_UTF8, fromString(" ").trim(fromString(" ")));

  // multi-byte sources; the no-arg trim() is kept as a baseline and the overload under test
  // is checked against the same input so both paths are covered
  assertEquals(fromString("数据砖头"), fromString(" 数据砖头 ").trim());
  assertEquals(fromString("数据砖头"), fromString(" 数据砖头 ").trim(fromString(" ")));
  assertEquals(fromString("数"), fromString("a数b").trim(fromString("ab")));
  assertEquals(fromString(""), fromString("a").trim(fromString("a数b")));
  assertEquals(fromString(""), fromString("数数 数数数").trim(fromString("数 ")));
  assertEquals(fromString("据砖头"), fromString("数]数[数据砖头#数数").trim(fromString("[数]#")));
  assertEquals(fromString("据砖头数数 "), fromString("数数数据砖头数数 ").trim(fromString("数")));
}

/**
 * Exercises {@code trimLeft(UTF8String)} with single-byte and multi-byte sources and trim sets.
 */
@Test
public void trimLeftWithTrimString() {
  // single-byte (ASCII) sources
  checkTrimLeft(" hello ", " hello ", "");
  checkTrimLeft("", "a", "a");
  checkTrimLeft("b", "b", "a");
  checkTrimLeft("ba", "ba", "a");
  checkTrimLeft("", "aaaaaaa", "a");
  checkTrimLeft("trim", "oabtrim", "bao");
  checkTrimLeft("rim ", "ooootrim ", "otm");

  // a source made up entirely of trim characters collapses to the empty string
  assertEquals(EMPTY_UTF8, fromString(" ").trimLeft(fromString(" ")));

  // multi-byte sources and/or trim sets
  checkTrimLeft("数据砖头 ", " 数据砖头 ", " ");
  checkTrimLeft("数", "数", "a");
  checkTrimLeft("a", "a", "数");
  checkTrimLeft("砖头数数", "数数数据砖头数数", "据数");
  checkTrimLeft("据砖头数数", " 数数数据砖头数数", "数 ");
  checkTrimLeft("据砖头数数", "aa数数数据砖头数数", "a数砖");
  checkTrimLeft("$S,.$BR", ",,,,%$S,.$BR", "%,");
}

/** Asserts that left-trimming {@code source} with {@code trimChars} yields {@code expected}. */
private static void checkTrimLeft(String expected, String source, String trimChars) {
  assertEquals(fromString(expected), fromString(source).trimLeft(fromString(trimChars)));
}

/**
 * Exercises {@code trimRight(UTF8String)} with single-byte and multi-byte sources and trim sets.
 */
@Test
public void trimRightWithTrimString() {
  // single-byte (ASCII) sources
  checkTrimRight(" hello ", " hello ", "");
  checkTrimRight("", "a", "a");
  checkTrimRight("cc", "ccbaaaa", "ba");
  checkTrimRight("", "aabbbbaaa", "ab");
  checkTrimRight(" he", " hello ", " ol");
  checkTrimRight("oohell", "oohellooo../*&", "./,&%*o");

  // a source made up entirely of trim characters collapses to the empty string
  assertEquals(EMPTY_UTF8, fromString(" ").trimRight(fromString(" ")));

  // multi-byte sources and/or trim sets
  checkTrimRight(" 数据砖头", " 数据砖头 ", " ");
  checkTrimRight("数数砖头", "数数砖头数aa数", "a数");
  checkTrimRight("", "数数数据砖ab", "数据砖ab");
  checkTrimRight("头", "头a???/", "数?/*&^%a");
  checkTrimRight("头", "头数b数数 [", " []数b");
}

/** Asserts that right-trimming {@code source} with {@code trimChars} yields {@code expected}. */
private static void checkTrimRight(String expected, String source, String trimChars) {
  assertEquals(fromString(expected), fromString(source).trimRight(fromString(trimChars)));
}
}
Expand Up @@ -580,6 +580,8 @@ primaryExpression
| '(' query ')' #subqueryExpression
| qualifiedName '(' (setQuantifier? argument+=expression (',' argument+=expression)*)? ')'
(OVER windowSpec)? #functionCall
| qualifiedName '(' trimOption=(BOTH | LEADING | TRAILING) argument+=expression
FROM argument+=expression ')' #functionCall
| value=primaryExpression '[' index=valueExpression ']' #subscript
| identifier #columnReference
| base=primaryExpression '.' fieldName=identifier #dereference
Expand Down Expand Up @@ -748,6 +750,7 @@ nonReserved
| UNBOUNDED | WHEN
| DATABASE | SELECT | FROM | WHERE | HAVING | TO | TABLE | WITH | NOT | CURRENT_DATE | CURRENT_TIMESTAMP
| DIRECTORY
| BOTH | LEADING | TRAILING
;

SELECT: 'SELECT';
Expand Down Expand Up @@ -861,6 +864,9 @@ COMMIT: 'COMMIT';
ROLLBACK: 'ROLLBACK';
MACRO: 'MACRO';
IGNORE: 'IGNORE';
// Keywords accepted as the trimOption of a function call, i.e. the
// name '(' (BOTH | LEADING | TRAILING) expr FROM expr ')' form.
BOTH: 'BOTH';
LEADING: 'LEADING';
TRAILING: 'TRAILING';

IF: 'IF';
POSITION: 'POSITION';
Expand Down

0 comments on commit c66d64b

Please sign in to comment.