From 491ce7b536c4c9d73b38acc5cd05f2ecfaeeae1f Mon Sep 17 00:00:00 2001
From: Cheng Hao
Date: Tue, 16 Jun 2015 22:23:21 +0800
Subject: [PATCH 1/6] add ascii/base64/unbase64/encode/decode functions

---
 .../catalyst/analysis/FunctionRegistry.scala  |   5 +
 .../expressions/stringOperations.scala        | 129 ++++++++++++++++++
 .../expressions/StringFunctionsSuite.scala    |  58 +++++++-
 .../org/apache/spark/sql/functions.scala      |  93 +++++++++++++
 .../spark/sql/DataFrameFunctionsSuite.scala   |  35 +++++
 5 files changed, 315 insertions(+), 5 deletions(-)

diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/FunctionRegistry.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/FunctionRegistry.scala
index 9163b032adee4..ee13cbf08d17e 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/FunctionRegistry.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/FunctionRegistry.scala
@@ -155,11 +155,16 @@ object FunctionRegistry {
     expression[Sum]("sum"),
 
     // string functions
+    expression[Ascii]("ascii"),
+    expression[Base64]("base64"),
+    expression[Encode]("encode"),
+    expression[Decode]("decode"),
     expression[Lower]("lcase"),
     expression[Lower]("lower"),
     expression[StringLength]("length"),
     expression[Substring]("substr"),
     expression[Substring]("substring"),
+    expression[UnBase64]("unbase64"),
     expression[Upper]("ucase"),
     expression[UnHex]("unhex"),
     expression[Upper]("upper")
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringOperations.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringOperations.scala
index 57918b32f8a47..c4d879eca2493 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringOperations.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringOperations.scala
@@ -18,6 +18,7 @@ package org.apache.spark.sql.catalyst.expressions
 
 import java.util.regex.Pattern
+import java.nio.charset._
 
 import org.apache.spark.sql.catalyst.analysis.UnresolvedException
 import org.apache.spark.sql.catalyst.expressions.codegen._
@@ -298,3 +299,131 @@ case class StringLength(child: Expression) extends UnaryExpression with ExpectsI
 
   override def prettyName: String = "length"
 }
+
+/**
+ * Returns the numeric value of the first character of str.
+ */
+case class Ascii(child: Expression) extends UnaryExpression with ExpectsInputTypes {
+  override def dataType: DataType = IntegerType
+  override def expectedChildTypes: Seq[DataType] = Seq(StringType)
+
+  override def eval(input: InternalRow): Any = {
+    val string = child.eval(input)
+    if (string == null) {
+      null
+    } else {
+      val bytes = string.asInstanceOf[UTF8String].getBytes
+      if (bytes.length > 0) {
+        bytes(0).asInstanceOf[Int]
+      } else {
+        0
+      }
+    }
+  }
+
+  override def toString: String = s"ascii($child)"
+}
+
+/**
+ * Converts the argument from binary to a base 64 string.
+ */
+case class Base64(child: Expression) extends UnaryExpression with ExpectsInputTypes {
+  override def dataType: DataType = StringType
+  override def expectedChildTypes: Seq[DataType] = Seq(BinaryType)
+
+  override def eval(input: InternalRow): Any = {
+    val bytes = child.eval(input)
+    if (bytes == null) {
+      null
+    } else {
+      UTF8String.fromBytes(
+        org.apache.commons.codec.binary.Base64.encodeBase64(
+          bytes.asInstanceOf[Array[Byte]]))
+    }
+  }
+
+  override def toString: String = s"base64($child)"
+}
+
+/**
+ * Converts the argument from a base 64 string to BINARY.
+ */
+case class UnBase64(child: Expression) extends UnaryExpression with ExpectsInputTypes {
+  override def dataType: DataType = BinaryType
+  override def expectedChildTypes: Seq[DataType] = Seq(StringType)
+
+  override def eval(input: InternalRow): Any = {
+    val string = child.eval(input)
+    if (string == null) {
+      null
+    } else {
+      org.apache.commons.codec.binary.Base64.decodeBase64(string.asInstanceOf[UTF8String].toString)
+    }
+  }
+
+  override def toString: String = s"unbase64($child)"
+}
+
+/**
+ * Decodes the first argument into a String using the provided character set
+ * (one of 'US-ASCII', 'ISO-8859-1', 'UTF-8', 'UTF-16BE', 'UTF-16LE', 'UTF-16').
+ * If either argument is null, the result will also be null. (As of Hive 0.12.0.)
+ */
+case class Decode(bin: Expression, charset: Expression)
+  extends Expression with ExpectsInputTypes {
+  override def children: Seq[Expression] = bin :: charset :: Nil
+  override def foldable: Boolean = bin.foldable && charset.foldable
+  override def nullable: Boolean = bin.nullable || charset.nullable
+  override def dataType: DataType = StringType
+  override def expectedChildTypes: Seq[DataType] = Seq(BinaryType, StringType)
+
+  override def eval(input: InternalRow): Any = {
+    val l = bin.eval(input)
+    if (l == null) {
+      null
+    } else {
+      val r = charset.eval(input)
+      if (r == null) {
+        null
+      } else {
+        val fromCharset = r.asInstanceOf[UTF8String].toString
+        UTF8String.fromString(new String(l.asInstanceOf[Array[Byte]], fromCharset))
+      }
+    }
+  }
+
+  override def toString: String = s"decode($bin, $charset)"
+}
+
+/**
+* Encodes the first argument into a BINARY using the provided character set
+ * (one of 'US-ASCII', 'ISO-8859-1', 'UTF-8', 'UTF-16BE', 'UTF-16LE', 'UTF-16').
+ * If either argument is null, the result will also be null. (As of Hive 0.12.0.)
+ */
+case class Encode(value: Expression, charset: Expression)
+  extends Expression with ExpectsInputTypes {
+  override def children: Seq[Expression] = value :: charset :: Nil
+  override def foldable: Boolean = value.foldable && charset.foldable
+  override def nullable: Boolean = value.nullable || charset.nullable
+  override def dataType: DataType = BinaryType
+  override def expectedChildTypes: Seq[DataType] = Seq(StringType, StringType)
+
+  override def eval(input: InternalRow): Any = {
+    val l = value.eval(input)
+    if (l == null) {
+      null
+    } else {
+      val r = charset.eval(input)
+      if (r == null) {
+        null
+      } else {
+        val toCharset = r.asInstanceOf[UTF8String].toString
+        l.asInstanceOf[UTF8String].toString.getBytes(toCharset)
+      }
+    }
+  }
+
+  override def toString: String = s"encode($value, $charset)"
+}
+
+
diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/StringFunctionsSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/StringFunctionsSuite.scala
index 5dbb1d562c1d9..76bb4e76d7b11 100644
--- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/StringFunctionsSuite.scala
+++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/StringFunctionsSuite.scala
@@ -19,7 +19,7 @@ package org.apache.spark.sql.catalyst.expressions
 
 import org.apache.spark.SparkFunSuite
 import org.apache.spark.sql.catalyst.dsl.expressions._
-import org.apache.spark.sql.types.{IntegerType, StringType}
+import org.apache.spark.sql.types.{BinaryType, IntegerType, StringType}
 
 class StringFunctionsSuite extends SparkFunSuite with ExpressionEvalHelper {
 
@@ -217,11 +217,59 @@ class StringFunctionsSuite extends SparkFunSuite with ExpressionEvalHelper {
   }
 
   test("length for string") {
-    val regEx = 'a.string.at(0)
+    val a = 'a.string.at(0)
     checkEvaluation(StringLength(Literal("abc")), 3, create_row("abdef"))
-    checkEvaluation(StringLength(regEx), 5, create_row("abdef"))
-    checkEvaluation(StringLength(regEx), 0, create_row(""))
-    checkEvaluation(StringLength(regEx), null, create_row(null))
+    checkEvaluation(StringLength(a), 5, create_row("abdef"))
+    checkEvaluation(StringLength(a), 0, create_row(""))
+    checkEvaluation(StringLength(a), null, create_row(null))
     checkEvaluation(StringLength(Literal.create(null, StringType)), null, create_row("abdef"))
   }
+
+  test("ascii for string") {
+    val a = 'a.string.at(0)
+    checkEvaluation(Ascii(Literal("efg")), 101, create_row("abdef"))
+    checkEvaluation(Ascii(a), 97, create_row("abdef"))
+    checkEvaluation(Ascii(a), 0, create_row(""))
+    checkEvaluation(Ascii(a), null, create_row(null))
+    checkEvaluation(Ascii(Literal.create(null, StringType)), null, create_row("abdef"))
+  }
+
+  test("base64/unbase64 for string") {
+    val a = 'a.string.at(0)
+    val b = 'b.binary.at(0)
+
+    checkEvaluation(Base64(Literal(Array[Byte](1,2,3,4))), "AQIDBA==", create_row("abdef"))
+    checkEvaluation(Base64(UnBase64(Literal("AQIDBA=="))), "AQIDBA==", create_row("abdef"))
+    checkEvaluation(Base64(UnBase64(Literal(""))), "", create_row("abdef"))
+    checkEvaluation(Base64(UnBase64(Literal.create(null, StringType))), null, create_row("abdef"))
+    checkEvaluation(Base64(UnBase64(a)), "AQIDBA==", create_row("AQIDBA=="))
+
+    checkEvaluation(Base64(b), "AQIDBA==", create_row(Array[Byte](1,2,3,4)))
+    checkEvaluation(Base64(b), "", create_row(Array[Byte]()))
+    checkEvaluation(Base64(b), null, create_row(null))
+    checkEvaluation(Base64(Literal.create(null, StringType)), null, create_row("abdef"))
+
+    checkEvaluation(UnBase64(a), null, create_row(null))
+    checkEvaluation(UnBase64(Literal.create(null, StringType)), null, create_row("abdef"))
+  }
+
+  test("encode/decode for string") {
+    val a = 'a.string.at(0)
+    val b = 'b.binary.at(0)
+
+    checkEvaluation(
+      Decode(Encode(Literal("大千世界"), Literal("UTF-16LE")), Literal("UTF-16LE")), "大千世界")
+    checkEvaluation(
+      Decode(Encode(a, Literal("utf-8")), Literal("utf-8")), "大千世界", create_row("大千世界"))
+    checkEvaluation(
+      Decode(Encode(a, Literal("utf-8")), Literal("utf-8")), "", create_row(""))
+
+    checkEvaluation(Encode(a, Literal("utf-8")), null, create_row(null))
+    checkEvaluation(Encode(Literal.create(null, StringType), Literal("utf-8")), null)
+    checkEvaluation(Encode(a, Literal.create(null, StringType)), null, create_row(""))
+
+    checkEvaluation(Decode(b, Literal("utf-8")), null, create_row(null))
+    checkEvaluation(Decode(Literal.create(null, BinaryType), Literal("utf-8")), null)
+    checkEvaluation(Decode(b, Literal.create(null, StringType)), null, create_row(null))
+  }
 }
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/functions.scala b/sql/core/src/main/scala/org/apache/spark/sql/functions.scala
index 0d5d49c3dd1d7..774569317e0f1 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/functions.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/functions.scala
@@ -1543,6 +1543,7 @@ object functions {
 
   /**
    * Computes the length of a given string value
+   *
    * @group string_funcs
    * @since 1.5.0
    */
@@ -1550,11 +1551,103 @@ object functions {
 
   /**
    * Computes the length of a given string column
+   *
    * @group string_funcs
    * @since 1.5.0
    */
   def strlen(columnName: String): Column = strlen(Column(columnName))
 
+  /**
+   * Computes the numeric value of the first character of the specified string value.
+   *
+   * @group string_funcs
+   * @since 1.5.0
+   */
+  def ascii(e: Column): Column = Ascii(e.expr)
+
+  /**
+   * Computes the numeric value of the first character of the specified string column.
+   *
+   * @group string_funcs
+   * @since 1.5.0
+   */
+  def ascii(columnName: String): Column = ascii(Column(columnName))
+
+  /**
+   * Computes the specified value from binary to a base 64 string.
+   *
+   * @group string_funcs
+   * @since 1.5.0
+   */
+  def base64(e: Column): Column = Base64(e.expr)
+
+  /**
+   * Computes the specified column from binary to a base 64 string.
+   *
+   * @group string_funcs
+   * @since 1.5.0
+   */
+  def base64(columnName: String): Column = base64(Column(columnName))
+
+  /**
+   * Computes the specified value from a base 64 string to binary.
+   *
+   * @group string_funcs
+   * @since 1.5.0
+   */
+  def unbase64(e: Column): Column = UnBase64(e.expr)
+
+  /**
+   * Computes the specified column from a base 64 string to binary.
+   *
+   * @group string_funcs
+   * @since 1.5.0
+   */
+  def unbase64(columnName: String): Column = unbase64(Column(columnName))
+
+  /**
+   * Computes the first argument into a binary from a string using the provided character set
+   * (one of 'US-ASCII', 'ISO-8859-1', 'UTF-8', 'UTF-16BE', 'UTF-16LE', 'UTF-16').
+   * If either argument is null, the result will also be null.
+   *
+   * @group string_funcs
+   * @since 1.5.0
+   */
+  def encode(value: Column, charset: Column): Column = Encode(value.expr, charset.expr)
+
+  /**
+   * Computes the first argument into a binary from a string using the provided character set
+   * (one of 'US-ASCII', 'ISO-8859-1', 'UTF-8', 'UTF-16BE', 'UTF-16LE', 'UTF-16').
+   * If either argument is null, the result will also be null.
+   *
+   * @group string_funcs
+   * @since 1.5.0
+   */
+  def encode(columnName: String, charsetColumnName: String): Column =
+    encode(Column(columnName), Column(charsetColumnName))
+
+  /**
+   * Computes the first argument into a string from a binary using the provided character set
+   * (one of 'US-ASCII', 'ISO-8859-1', 'UTF-8', 'UTF-16BE', 'UTF-16LE', 'UTF-16').
+   * If either argument is null, the result will also be null.
+   *
+   * @group string_funcs
+   * @since 1.5.0
+   */
+  def decode(value: Column, charset: Column): Column = Decode(value.expr, charset.expr)
+
+  /**
+   * Computes the first argument into a string from a binary using the provided character set
+   * (one of 'US-ASCII', 'ISO-8859-1', 'UTF-8', 'UTF-16BE', 'UTF-16LE', 'UTF-16').
+   * If either argument is null, the result will also be null.
+   *
+   * @group string_funcs
+   * @since 1.5.0
+   */
+  def decode(columnName: String, charsetColumnName: String): Column =
+    decode(Column(columnName), Column(charsetColumnName))
+
+
   //////////////////////////////////////////////////////////////////////////////////////////////
   //////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameFunctionsSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameFunctionsSuite.scala
index 0d43aca877f68..3e3c6b18f1e46 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameFunctionsSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameFunctionsSuite.scala
@@ -225,4 +225,39 @@ class DataFrameFunctionsSuite extends QueryTest {
       Row(l)
     })
   }
+
+  test("string ascii function") {
+    val df = Seq(("abc", "")).toDF("a", "b")
+    checkAnswer(
+      df.select(ascii($"a"), ascii("b")),
+      Row(97, 0))
+
+    checkAnswer(
+      df.selectExpr("ascii(a)", "ascii(b)"),
+      Row(97, 0))
+  }
+
+  test("string base64/unbase64 function") {
+    val bytes = Array[Byte](1, 2, 3, 4)
+    val df = Seq((bytes, "AQIDBA==")).toDF("a", "b")
+    checkAnswer(
+      df.select(base64("a"), base64($"a"), unbase64("b"), unbase64($"b")),
+      Row("AQIDBA==", "AQIDBA==", bytes, bytes))
+
+    checkAnswer(
+      df.selectExpr("base64(a)", "unbase64(b)"),
+      Row("AQIDBA==", bytes))
+  }
+
+  test("string encode/decode function") {
+    val bytes = Array[Byte](-27, -92, -89, -27, -115, -125, -28, -72, -106, -25, -107, -116)
+    val df = Seq(("大千世界", "utf-8", bytes)).toDF("a", "b", "c")
+    checkAnswer(
+      df.select(encode($"a", $"b"), encode("a", "b"), decode($"c", $"b"), decode("c", "b")),
+      Row(bytes, bytes, "大千世界", "大千世界"))
+
+    checkAnswer(
+      df.selectExpr("encode(a, b)", "decode(c, b)"),
+      Row(bytes, "大千世界"))
+  }
 }

From e2df76859981a9a400b63b8ddf90b60f44783c6f Mon Sep 17 00:00:00 2001
From: Cheng Hao
Date: Tue, 16 Jun 2015 07:25:42 -0700
Subject: [PATCH 2/6] remove the unused import

---
 .../apache/spark/sql/catalyst/expressions/stringOperations.scala | 1 -
 1 file changed, 1 deletion(-)

diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringOperations.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringOperations.scala
index c4d879eca2493..e37c31552122a 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringOperations.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringOperations.scala
@@ -18,7 +18,6 @@ package org.apache.spark.sql.catalyst.expressions
 
 import java.util.regex.Pattern
-import java.nio.charset._
 
 import org.apache.spark.sql.catalyst.analysis.UnresolvedException
 import org.apache.spark.sql.catalyst.expressions.codegen._

From 96170fcd9544346428261a234db7e718efecd790 Mon Sep 17 00:00:00 2001
From: Cheng Hao
Date: Tue, 16 Jun 2015 08:06:38 -0700
Subject: [PATCH 3/6] scalastyle issues

---
 .../catalyst/expressions/StringFunctionsSuite.scala    | 10 ++++++----
 .../org/apache/spark/sql/DataFrameFunctionsSuite.scala |  3 +++
 2 files changed, 9 insertions(+), 4 deletions(-)

diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/StringFunctionsSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/StringFunctionsSuite.scala
index 76bb4e76d7b11..468df20442d38 100644
--- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/StringFunctionsSuite.scala
+++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/StringFunctionsSuite.scala
@@ -237,14 +237,15 @@ class StringFunctionsSuite extends SparkFunSuite with ExpressionEvalHelper {
   test("base64/unbase64 for string") {
     val a = 'a.string.at(0)
     val b = 'b.binary.at(0)
+    val bytes = Array[Byte](1, 2, 3, 4)
 
-    checkEvaluation(Base64(Literal(Array[Byte](1,2,3,4))), "AQIDBA==", create_row("abdef"))
+    checkEvaluation(Base64(Literal(bytes)), "AQIDBA==", create_row("abdef"))
     checkEvaluation(Base64(UnBase64(Literal("AQIDBA=="))), "AQIDBA==", create_row("abdef"))
     checkEvaluation(Base64(UnBase64(Literal(""))), "", create_row("abdef"))
     checkEvaluation(Base64(UnBase64(Literal.create(null, StringType))), null, create_row("abdef"))
     checkEvaluation(Base64(UnBase64(a)), "AQIDBA==", create_row("AQIDBA=="))
 
-    checkEvaluation(Base64(b), "AQIDBA==", create_row(Array[Byte](1,2,3,4)))
+    checkEvaluation(Base64(b), "AQIDBA==", create_row(bytes))
     checkEvaluation(Base64(b), "", create_row(Array[Byte]()))
     checkEvaluation(Base64(b), null, create_row(null))
     checkEvaluation(Base64(Literal.create(null, StringType)), null, create_row("abdef"))
@@ -256,14 +257,15 @@ class StringFunctionsSuite extends SparkFunSuite with ExpressionEvalHelper {
   test("encode/decode for string") {
     val a = 'a.string.at(0)
     val b = 'b.binary.at(0)
-
+    // scalastyle:off
+    // non-ASCII characters are not allowed in the code, so we disable scalastyle here.
     checkEvaluation(
       Decode(Encode(Literal("大千世界"), Literal("UTF-16LE")), Literal("UTF-16LE")), "大千世界")
     checkEvaluation(
       Decode(Encode(a, Literal("utf-8")), Literal("utf-8")), "大千世界", create_row("大千世界"))
     checkEvaluation(
       Decode(Encode(a, Literal("utf-8")), Literal("utf-8")), "", create_row(""))
-
+    // scalastyle:on
     checkEvaluation(Encode(a, Literal("utf-8")), null, create_row(null))
     checkEvaluation(Encode(Literal.create(null, StringType), Literal("utf-8")), null)
     checkEvaluation(Encode(a, Literal.create(null, StringType)), null, create_row(""))
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameFunctionsSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameFunctionsSuite.scala
index 3e3c6b18f1e46..bd9fa400e5b34 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameFunctionsSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameFunctionsSuite.scala
@@ -251,6 +251,8 @@ class DataFrameFunctionsSuite extends QueryTest {
 
   test("string encode/decode function") {
     val bytes = Array[Byte](-27, -92, -89, -27, -115, -125, -28, -72, -106, -25, -107, -116)
+    // scalastyle:off
+    // non-ASCII characters are not allowed in the code, so we disable scalastyle here.
     val df = Seq(("大千世界", "utf-8", bytes)).toDF("a", "b", "c")
     checkAnswer(
       df.select(encode($"a", $"b"), encode("a", "b"), decode($"c", $"b"), decode("c", "b")),
       Row(bytes, bytes, "大千世界", "大千世界"))
@@ -259,5 +261,6 @@ class DataFrameFunctionsSuite extends QueryTest {
     checkAnswer(
       df.selectExpr("encode(a, b)", "decode(c, b)"),
       Row(bytes, "大千世界"))
+    // scalastyle:on
   }
 }

From ed5c19cc5974d207340910ded2e36957b39ae377 Mon Sep 17 00:00:00 2001
From: Cheng Hao
Date: Thu, 18 Jun 2015 21:34:16 +0800
Subject: [PATCH 4/6] update code per review comments

---
 .../expressions/stringOperations.scala        | 25 +++++++++----------
 1 file changed, 12 insertions(+), 13 deletions(-)

diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringOperations.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringOperations.scala
index e37c31552122a..b6ba2be4f221f 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringOperations.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringOperations.scala
@@ -304,7 +304,7 @@ case class StringLength(child: Expression) extends UnaryExpression with ExpectsI
  */
 case class Ascii(child: Expression) extends UnaryExpression with ExpectsInputTypes {
   override def dataType: DataType = IntegerType
-  override def expectedChildTypes: Seq[DataType] = Seq(StringType)
+  override def inputTypes: Seq[DataType] = Seq(StringType)
 
   override def eval(input: InternalRow): Any = {
     val string = child.eval(input)
@@ -320,7 +320,7 @@ case class Ascii(child: Expression) extends UnaryExpression with ExpectsInputTyp
     }
   }
 
-  override def toString: String = s"ascii($child)"
+  override def toString: String = s"ASCII($child)"
 }
 
 /**
@@ -328,7 +328,7 @@ case class Base64(child: Expression) extends UnaryExpression with ExpectsInputTy
  */
 case class Base64(child: Expression) extends UnaryExpression with ExpectsInputTypes {
   override def dataType: DataType = StringType
-  override def expectedChildTypes: Seq[DataType] = Seq(BinaryType)
+  override def inputTypes: Seq[DataType] = Seq(BinaryType)
 
   override def eval(input: InternalRow): Any = {
     val bytes = child.eval(input)
@@ -341,7 +341,7 @@ case class Base64(child: Expression) extends UnaryExpression with ExpectsInputTy
     }
   }
 
-  override def toString: String = s"base64($child)"
+  override def toString: String = s"BASE64($child)"
 }
 
 /**
@@ -349,7 +349,7 @@ case class Base64(child: Expression) extends UnaryExpression with ExpectsInputTy
  */
 case class UnBase64(child: Expression) extends UnaryExpression with ExpectsInputTypes {
   override def dataType: DataType = BinaryType
-  override def expectedChildTypes: Seq[DataType] = Seq(StringType)
+  override def inputTypes: Seq[DataType] = Seq(StringType)
 
   override def eval(input: InternalRow): Any = {
     val string = child.eval(input)
@@ -360,7 +360,7 @@ case class UnBase64(child: Expression) extends UnaryExpression with ExpectsInput
     }
   }
 
-  override def toString: String = s"unbase64($child)"
+  override def toString: String = s"UNBASE64($child)"
 }
 
 /**
@@ -368,13 +368,12 @@ case class UnBase64(child: Expression) extends UnaryExpression with ExpectsInput
  * (one of 'US-ASCII', 'ISO-8859-1', 'UTF-8', 'UTF-16BE', 'UTF-16LE', 'UTF-16').
 * If either argument is null, the result will also be null. (As of Hive 0.12.0.)
 */
-case class Decode(bin: Expression, charset: Expression)
-  extends Expression with ExpectsInputTypes {
+case class Decode(bin: Expression, charset: Expression) extends Expression with ExpectsInputTypes {
   override def children: Seq[Expression] = bin :: charset :: Nil
   override def foldable: Boolean = bin.foldable && charset.foldable
   override def nullable: Boolean = bin.nullable || charset.nullable
   override def dataType: DataType = StringType
-  override def expectedChildTypes: Seq[DataType] = Seq(BinaryType, StringType)
+  override def inputTypes: Seq[DataType] = Seq(BinaryType, StringType)
 
   override def eval(input: InternalRow): Any = {
     val l = bin.eval(input)
@@ -391,11 +390,11 @@
     }
   }
 
-  override def toString: String = s"decode($bin, $charset)"
+  override def toString: String = s"DECODE($bin, $charset)"
 }
 
 /**
-* Encodes the first argument into a BINARY using the provided character set
+ * Encodes the first argument into a BINARY using the provided character set
  * (one of 'US-ASCII', 'ISO-8859-1', 'UTF-8', 'UTF-16BE', 'UTF-16LE', 'UTF-16').
  * If either argument is null, the result will also be null. (As of Hive 0.12.0.)
  */
@@ -405,7 +404,7 @@ case class Encode(value: Expression, charset: Expression)
   override def foldable: Boolean = value.foldable && charset.foldable
   override def nullable: Boolean = value.nullable || charset.nullable
   override def dataType: DataType = BinaryType
-  override def expectedChildTypes: Seq[DataType] = Seq(StringType, StringType)
+  override def inputTypes: Seq[DataType] = Seq(StringType, StringType)
 
   override def eval(input: InternalRow): Any = {
     val l = value.eval(input)
@@ -422,7 +421,7 @@ case class Encode(value: Expression, charset: Expression)
     }
   }
 
-  override def toString: String = s"encode($value, $charset)"
+  override def toString: String = s"ENCODE($value, $charset)"
 }

From 9d6f9f449aafe062359a9585d41351e6aa267378 Mon Sep 17 00:00:00 2001
From: Cheng Hao
Date: Fri, 3 Jul 2015 21:41:17 +0800
Subject: [PATCH 5/6] remove the toString methods for expressions

---
 .../sql/catalyst/expressions/stringOperations.scala | 10 ----------
 1 file changed, 10 deletions(-)

diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringOperations.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringOperations.scala
index b6ba2be4f221f..154ac3508c0c5 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringOperations.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringOperations.scala
@@ -319,8 +319,6 @@ case class Ascii(child: Expression) extends UnaryExpression with ExpectsInputTyp
       }
     }
   }
-
-  override def toString: String = s"ASCII($child)"
 }
 
 /**
@@ -340,8 +338,6 @@ case class Base64(child: Expression) extends UnaryExpression with ExpectsInputTy
           bytes.asInstanceOf[Array[Byte]]))
     }
   }
-
-  override def toString: String = s"BASE64($child)"
 }
 
 /**
@@ -359,8 +355,6 @@ case class UnBase64(child: Expression) extends UnaryExpression with ExpectsInput
       org.apache.commons.codec.binary.Base64.decodeBase64(string.asInstanceOf[UTF8String].toString)
     }
   }
-
-  override def toString: String = s"UNBASE64($child)"
 }
 
 /**
@@ -389,8 +383,6 @@ case class Decode(bin: Expression, charset: Expression) extends Expression with
       }
     }
   }
-
-  override def toString: String = s"DECODE($bin, $charset)"
 }
 
 /**
@@ -420,8 +412,6 @@
       }
     }
   }
-
-  override def toString: String = s"ENCODE($value, $charset)"
 }

From 78dee7d9ca2e9896d82af59ba0e4130415f4559c Mon Sep 17 00:00:00 2001
From: Cheng Hao
Date: Fri, 3 Jul 2015 21:44:40 +0800
Subject: [PATCH 6/6] base 64 -> base64

---
 .../src/main/scala/org/apache/spark/sql/functions.scala | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/sql/core/src/main/scala/org/apache/spark/sql/functions.scala b/sql/core/src/main/scala/org/apache/spark/sql/functions.scala
index 774569317e0f1..91cce4ad1e8bb 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/functions.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/functions.scala
@@ -1574,7 +1574,7 @@ object functions {
   def ascii(columnName: String): Column = ascii(Column(columnName))
 
   /**
-   * Computes the specified value from binary to a base 64 string.
+   * Computes the specified value from binary to a base64 string.
    *
    * @group string_funcs
    * @since 1.5.0
@@ -1582,7 +1582,7 @@ object functions {
   def base64(e: Column): Column = Base64(e.expr)
 
   /**
-   * Computes the specified column from binary to a base 64 string.
+   * Computes the specified column from binary to a base64 string.
    *
    * @group string_funcs
   * @since 1.5.0
@@ -1590,7 +1590,7 @@ object functions {
   def base64(columnName: String): Column = base64(Column(columnName))
 
   /**
-   * Computes the specified value from a base 64 string to binary.
+   * Computes the specified value from a base64 string to binary.
    *
    * @group string_funcs
    * @since 1.5.0
@@ -1598,7 +1598,7 @@ object functions {
   def unbase64(e: Column): Column = UnBase64(e.expr)
 
   /**
-   * Computes the specified column from a base 64 string to binary.
+   * Computes the specified column from a base64 string to binary.
    *
    * @group string_funcs
    * @since 1.5.0