21 commits
5809496
Add description of SQL CLI and HiveServer.
OopsOutOfMemory Oct 28, 2014
c577589
Merge branch 'master' of https://github.com/apache/spark
OopsOutOfMemory Oct 28, 2014
e654704
Merge branch 'master' of https://github.com/apache/spark
OopsOutOfMemory Oct 29, 2014
3269eab
Add system function trim
OopsOutOfMemory Oct 29, 2014
ce6899a
HiveQL support trim
OopsOutOfMemory Oct 29, 2014
e2781ee
modify keyword of Trim
OopsOutOfMemory Oct 29, 2014
2166c77
correct spelling mistake
OopsOutOfMemory Oct 29, 2014
a4f4e4b
correct spelling mistake
OopsOutOfMemory Oct 29, 2014
2137e28
add test suit for trim
OopsOutOfMemory Oct 29, 2014
10d8ace
support 3 functions : ltrim rtrim length in SparkQL and HiveQl
OopsOutOfMemory Oct 29, 2014
0a0f4e0
change the name of the trait CaseConversionExpression to StringTransf…
OopsOutOfMemory Oct 29, 2014
0fa2cd6
change return type
OopsOutOfMemory Oct 29, 2014
b358048
deleted: sql/README.md
OopsOutOfMemory Oct 29, 2014
558d7bf
new file: sql/README.md
OopsOutOfMemory Oct 29, 2014
ab29a7e
Merge branch 'sparksql' of https://github.com/OopsOutOfMemory/spark i…
OopsOutOfMemory Oct 29, 2014
57111f5
change the implementation of ltrim and trim from regex to apache comm…
OopsOutOfMemory Oct 31, 2014
b7790f4
Merge branch 'master' of https://github.com/OopsOutOfMemory/spark int…
OopsOutOfMemory Oct 31, 2014
dca6adb
change to scala native implementation
OopsOutOfMemory Nov 3, 2014
5989358
fix ident issues and add test case
OopsOutOfMemory Nov 4, 2014
0925b32
spelling correct
OopsOutOfMemory Nov 4, 2014
addfbd9
remove unused golden file
OopsOutOfMemory Nov 4, 2014
@@ -103,6 +103,10 @@ class SqlParser extends AbstractSparkSQLParser {
protected val UPPER = Keyword("UPPER")
protected val WHEN = Keyword("WHEN")
protected val WHERE = Keyword("WHERE")
protected val TRIM = Keyword("TRIM")
protected val LTRIM = Keyword("LTRIM")
protected val RTRIM = Keyword("RTRIM")
protected val LENGTH = Keyword("LENGTH")

// Use reflection to find the reserved words defined in this class.
protected val reservedWords =
@@ -283,6 +287,10 @@ class SqlParser extends AbstractSparkSQLParser {
| MAX ~ "(" ~> expression <~ ")" ^^ { case exp => Max(exp) }
| UPPER ~ "(" ~> expression <~ ")" ^^ { case exp => Upper(exp) }
| LOWER ~ "(" ~> expression <~ ")" ^^ { case exp => Lower(exp) }
| TRIM ~ "(" ~> expression <~ ")" ^^ { case exp => Trim(exp) }
| LTRIM ~ "(" ~> expression <~ ")" ^^ { case exp => Ltrim(exp) }
| RTRIM ~ "(" ~> expression <~ ")" ^^ { case exp => Rtrim(exp) }
| LENGTH ~ "(" ~> expression <~ ")" ^^ { case exp => Length(exp) }
| IF ~ "(" ~> expression ~ ("," ~> expression) ~ ("," ~> expression) <~ ")" ^^
{ case c ~ t ~ f => If(c, t, f) }
| CASE ~> expression.? ~ (WHEN ~> expression ~ (THEN ~> expression)).* ~
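Each new production follows the same combinator shape already used for UPPER and LOWER: match the keyword, parse a parenthesized expression, and wrap the result in the corresponding catalyst expression. Below is a minimal, self-contained sketch of that shape using Scala's standard parser combinators; MiniFunctionParser, its Literal/Trim stand-ins, and the regex-based keyword are illustrative simplifications, not the actual SqlParser internals.

import scala.util.parsing.combinator.JavaTokenParsers

object MiniFunctionParser extends JavaTokenParsers {
  // Stand-ins for catalyst expressions, for this sketch only.
  sealed trait Expr
  case class Literal(s: String) extends Expr
  case class Trim(child: Expr) extends Expr

  // The only "expression" this sketch understands is a double-quoted string.
  def expression: Parser[Expr] =
    stringLiteral ^^ { s => Literal(s.stripPrefix("\"").stripSuffix("\"")) }

  // KEYWORD ~ "(" ~> expression <~ ")" ^^ { wrap the child } -- the same shape
  // as the TRIM/LTRIM/RTRIM/LENGTH productions above.
  def trimCall: Parser[Expr] =
    "(?i)TRIM".r ~ "(" ~> expression <~ ")" ^^ { case exp => Trim(exp) }

  def main(args: Array[String]): Unit = {
    println(parseAll(trimCall, """TRIM("  hello  ")""").get)  // Trim(Literal(  hello  ))
  }
}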
@@ -23,7 +23,7 @@ import scala.collection.IndexedSeqOptimized


import org.apache.spark.sql.catalyst.analysis.UnresolvedException
import org.apache.spark.sql.catalyst.types.{BinaryType, BooleanType, DataType, StringType}
import org.apache.spark.sql.catalyst.types._

trait StringRegexExpression {
self: BinaryExpression =>
@@ -71,7 +71,7 @@ trait StringRegexExpression {
}
}

trait CaseConversionExpression {
trait StringTransformationExpression {
self: UnaryExpression =>

type EvaluatedType = Any
@@ -92,6 +92,34 @@ trait CaseConversionExpression {
}
}

/**
* This trait is used for string computations that return IntegerType.
* Functions that return IntegerType should use this trait,
* e.g. length(s), instr(string1, string2, start_position, nth_appearance).
*/

trait StringCalculationExpression {
Contributor:
This trait seems only used by Length, how about just merge them? I think we can extract the trait in the future if we have more string operators returns IntegerType. Sorry not sure if you're planning on this.

Contributor Author:
@chenghao-intel yep, what I mean is to add a trait to support return type of Integer. Because function like LENGTH(n) which compute the size of a string return Integer, INSTR(chr1,chr2,[n,[m]]) which compute a position also return Integer. But I'm not sure whether this will cause some side effect or not.

self: UnaryExpression =>

type EvaluatedType = Any

def calc(v: String): Int

override def foldable: Boolean = child.foldable
def nullable: Boolean = child.nullable
def dataType: DataType = IntegerType

override def eval(input: Row): Any = {
val evaluated = child.eval(input)
if (evaluated == null) {
null
} else {
calc(evaluated.toString)
}
}
}


/**
* Simple RegEx pattern matching function
*/
@@ -134,10 +162,51 @@ case class RLike(left: Expression, right: Expression)
override def matches(regex: Pattern, str: String): Boolean = regex.matcher(str).find(0)
}


/**
* A function that strips whitespace from the beginning of a string.
*/
case class Ltrim(child: Expression) extends UnaryExpression with StringTransformationExpression {

override def convert(v: String): String = v.dropWhile(_ == ' ')

override def toString() = s"Ltrim($child)"
}

/**
* A function that strips whitespace from the end of a string.
*/
case class Rtrim(child: Expression) extends UnaryExpression with StringTransformationExpression {

override def convert(v: String): String = v.reverse.dropWhile(_ == ' ').reverse

override def toString() = s"Rtrim($child)"
}

/**
* A function that calculates the length of a string.
*/
case class Length(child: Expression) extends UnaryExpression with StringCalculationExpression {

Contributor:
You have to override the override def dataType=IntegerType, it's StringType by default. That's why it causes failure in the unittest.

Contributor Author:
so it is.

override def calc(v: String): Int = v.length()

override def toString() = s"Length($child)"
}

/**
* A function that strips whitespace from both ends of a string.
*/
case class Trim(child: Expression) extends UnaryExpression with StringTransformationExpression {

override def convert(v: String): String = v.trim()

override def toString() = s"Trim($child)"
}

/**
* A function that converts the characters of a string to uppercase.
*/
case class Upper(child: Expression) extends UnaryExpression with CaseConversionExpression {
case class Upper(child: Expression) extends UnaryExpression with StringTransformationExpression {

override def convert(v: String): String = v.toUpperCase()

@@ -147,7 +216,7 @@ case class Upper(child: Expression) extends UnaryExpression with CaseConversionE
/**
* A function that converts the characters of a string to lowercase.
*/
case class Lower(child: Expression) extends UnaryExpression with CaseConversionExpression {
case class Lower(child: Expression) extends UnaryExpression with StringTransformationExpression {

override def convert(v: String): String = v.toLowerCase()

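To make the review discussion above concrete: StringCalculationExpression centralizes the null-propagating eval and the IntegerType result, so a future integer-returning string function such as INSTR could reuse it without repeating that boilerplate. The sketch below is self-contained and uses simplified stand-ins (a Map-backed Row, a bare Expr trait, a hypothetical Instr) rather than the real catalyst types; it only illustrates the pattern and is not part of this PR.

object StringExprSketch {
  type Row = Map[String, Any]                    // stand-in for a catalyst Row

  trait Expr { def eval(input: Row): Any }

  case class Attr(name: String) extends Expr {
    def eval(input: Row): Any = input.getOrElse(name, null)
  }

  // Mirrors StringCalculationExpression: null in, null out; otherwise an Int.
  trait IntStringCalc extends Expr {
    def child: Expr
    def calc(v: String): Int
    def eval(input: Row): Any = {
      val evaluated = child.eval(input)
      if (evaluated == null) null else calc(evaluated.toString)
    }
  }

  case class Length(child: Expr) extends IntStringCalc {
    def calc(v: String): Int = v.length
  }

  // Hypothetical Instr: 1-based position of `sub` in the string, 0 if absent.
  case class Instr(child: Expr, sub: String) extends IntStringCalc {
    def calc(v: String): Int = v.indexOf(sub) + 1
  }

  def main(args: Array[String]): Unit = {
    val row: Row = Map("s" -> " Good ")
    println(Length(Attr("s")).eval(row))        // 6
    println(Instr(Attr("s"), "oo").eval(row))   // 3
    println(Length(Attr("missing")).eval(row))  // null
  }
}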
50 changes: 50 additions & 0 deletions sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala
@@ -500,6 +500,56 @@ class SQLQuerySuite extends QueryTest with BeforeAndAfterAll {
(2, "abc"),
(3, null)))
}
test("system function trim()") {
checkAnswer(
sql("SELECT N,TRIM(L) FROM untrimmedData"),
Seq(
(1, "Good"),
(2, "To"),
(3, "See"),
(4, "You !")))

checkAnswer(
sql("SELECT n, TRIM(s) FROM nullStrings"),
Seq(
(1, "abc"),
(2, "ABC"),
(3, null)))
}

test("system function ltrim()") {
checkAnswer(
sql("SELECT N,LTRIM(L) FROM untrimmedData"),
Seq(
(1, "Good "),
(2, "To "),
(3, "See"),
(4, "You ! ")))

checkAnswer(
sql("SELECT n, LTRIM(s) FROM nullStrings"),
Seq(
(1, "abc"),
(2, "ABC"),
(3, null)))
}

test("system function rtrim()") {
checkAnswer(
sql("SELECT N,RTRIM(L) FROM untrimmedData"),
Seq(
(1, " Good"),
(2, "To"),
(3, " See"),
(4, " You !")))

checkAnswer(
sql("SELECT n, RTRIM(s) FROM nullStrings"),
Seq(
(1, "abc"),
(2, "ABC"),
(3, null)))
}

test("UNION") {
checkAnswer(
10 changes: 10 additions & 0 deletions sql/core/src/test/scala/org/apache/spark/sql/TestData.scala
@@ -72,6 +72,16 @@ object TestData {

val emptyTableData = logical.LocalRelation('a.int, 'b.int)

case class UntrimmedData(N: Int, L: String)
val untrimmedData =
TestSQLContext.sparkContext.parallelize(
UntrimmedData(1, " Good ") ::
UntrimmedData(2, "To ") ::
UntrimmedData(3, " See") ::
UntrimmedData(4, " You ! ") :: Nil).toSchemaRDD
untrimmedData.registerTempTable("untrimmedData")


case class UpperCaseData(N: Int, L: String)
val upperCaseData =
TestSQLContext.sparkContext.parallelize(
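The fixture strings in UntrimmedData are chosen so that TRIM, LTRIM, and RTRIM each produce a distinct result, matching the expectations in SQLQuerySuite above. A quick plain-Scala sanity check (no Spark involved), using the same scala-native operations as the Ltrim/Rtrim/Trim implementations in this PR:

object TrimFixtureCheck {
  val fixtures = Seq(" Good ", "To ", " See", " You ! ")

  def ltrim(s: String): String = s.dropWhile(_ == ' ')
  def rtrim(s: String): String = s.reverse.dropWhile(_ == ' ').reverse
  def trim(s: String): String  = s.trim

  def main(args: Array[String]): Unit = {
    assert(fixtures.map(trim)  == Seq("Good", "To", "See", "You !"))
    assert(fixtures.map(ltrim) == Seq("Good ", "To ", "See", "You ! "))
    assert(fixtures.map(rtrim) == Seq(" Good", "To", " See", " You !"))
    println("fixture expectations match the test suite")
  }
}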
@@ -869,6 +869,10 @@ private[hive] object HiveQl {
val MIN = "(?i)MIN".r
val UPPER = "(?i)UPPER".r
val LOWER = "(?i)LOWER".r
val TRIM = "(?i)TRIM".r
val LTRIM = "(?i)LTRIM".r
val RTRIM = "(?i)RTRIM".r
val LENGTH = "(?i)LENGTH".r
val RAND = "(?i)RAND".r
val AND = "(?i)AND".r
val OR = "(?i)OR".r
@@ -918,6 +922,10 @@ private[hive] object HiveQl {
/* System functions about string operations */
case Token("TOK_FUNCTION", Token(UPPER(), Nil) :: arg :: Nil) => Upper(nodeToExpr(arg))
case Token("TOK_FUNCTION", Token(LOWER(), Nil) :: arg :: Nil) => Lower(nodeToExpr(arg))
case Token("TOK_FUNCTION", Token(TRIM(), Nil) :: arg :: Nil) => Trim(nodeToExpr(arg))
case Token("TOK_FUNCTION", Token(LTRIM(), Nil) :: arg :: Nil) => Ltrim(nodeToExpr(arg))
case Token("TOK_FUNCTION", Token(RTRIM(), Nil) :: arg :: Nil) => Rtrim(nodeToExpr(arg))
case Token("TOK_FUNCTION", Token(LENGTH(), Nil) :: arg :: Nil) => Length(nodeToExpr(arg))

/* Casts */
case Token("TOK_FUNCTION", Token("TOK_STRING", Nil) :: arg :: Nil) =>
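On the HiveQl side the new keywords are case-insensitive regexes used as extractors inside the Token pattern match, so TRIM(), LTRIM(), RTRIM(), and LENGTH() match the function name in any casing. A self-contained sketch of that extractor trick (Token here is a simplified stand-in, not the real Hive AST node wrapper):

object KeywordMatchSketch {
  case class Token(text: String, children: List[Token])

  val TRIM   = "(?i)TRIM".r
  val LENGTH = "(?i)LENGTH".r

  // A Regex with no capture groups can be used as a pattern; it matches when
  // the whole string matches the regex, which is what makes Token(TRIM(), Nil)
  // case-insensitive in the HiveQl match above.
  def describe(functionName: String): String = Token(functionName, Nil) match {
    case Token(TRIM(), Nil)   => "maps to Trim(child)"
    case Token(LENGTH(), Nil) => "maps to Length(child)"
    case Token(other, _)      => s"unhandled function: $other"
  }

  def main(args: Array[String]): Unit = {
    println(describe("trim"))    // maps to Trim(child)
    println(describe("Length"))  // maps to Length(child)
    println(describe("substr"))  // unhandled function: substr
  }
}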