From 67902e59c6a447407f09506f6a0e8f131e7df489 Mon Sep 17 00:00:00 2001 From: Shilei Date: Fri, 19 Jun 2015 14:22:43 +0800 Subject: [PATCH 1/7] Add misc funcs: sha, crc32 --- .../catalyst/analysis/FunctionRegistry.scala | 3 + .../spark/sql/catalyst/expressions/misc.scala | 59 ++++++++++++++++++- .../expressions/MiscFunctionsSuite.scala | 15 +++++ .../org/apache/spark/sql/functions.scala | 32 ++++++++++ .../spark/sql/DataFrameFunctionsSuite.scala | 25 ++++++++ 5 files changed, 132 insertions(+), 2 deletions(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/FunctionRegistry.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/FunctionRegistry.scala index 457948a800a17..08eaabcab8405 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/FunctionRegistry.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/FunctionRegistry.scala @@ -135,6 +135,9 @@ object FunctionRegistry { // misc functions expression[Md5]("md5"), + expression[Sha]("sha"), + expression[Sha]("sha1"), + expression[Crc32]("crc32"), expression[Sha2]("sha2"), // aggregate functions diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/misc.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/misc.scala index e80706fc65aff..98eb3a3cbd977 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/misc.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/misc.scala @@ -19,10 +19,10 @@ package org.apache.spark.sql.catalyst.expressions import java.security.MessageDigest import java.security.NoSuchAlgorithmException - +import java.util.zip.CRC32 import org.apache.commons.codec.digest.DigestUtils import org.apache.spark.sql.catalyst.expressions.codegen._ -import org.apache.spark.sql.types.{BinaryType, IntegerType, StringType, DataType} +import org.apache.spark.sql.types.{BinaryType, IntegerType, StringType, DataType, LongType} import org.apache.spark.unsafe.types.UTF8String /** @@ -51,6 +51,59 @@ case class Md5(child: Expression) } } + +/** + * A function that calculates an SHA-1 digest and returns it as a hex string + * For input of type [[BinaryType]] + */ +case class Sha(child: Expression) + extends UnaryExpression with ExpectsInputTypes { + + override def dataType: DataType = StringType + + override def expectedChildTypes: Seq[DataType] = Seq(BinaryType) + + override def eval(input: InternalRow): Any = { + val value = child.eval(input) + if (value == null) { + null + } else { + UTF8String.fromString(DigestUtils.shaHex(value.asInstanceOf[Array[Byte]])) + } + } + + override def genCode(ctx: CodeGenContext, ev: GeneratedExpressionCode): String = { + defineCodeGen(ctx, ev, c => + "org.apache.spark.unsafe.types.UTF8String.fromString(" + + s"org.apache.commons.codec.digest.DigestUtils.shaHex($c))") + } + +} + +/** + * A function that computes a cyclic redundancy check value and returns it as a bigint + * For input of type [[BinaryType]] + */ +case class Crc32(child: Expression) + extends UnaryExpression with ExpectsInputTypes { + + override def dataType: DataType = LongType + + override def expectedChildTypes: Seq[DataType] = Seq(BinaryType) + + override def eval(input: InternalRow): Any = { + val value = child.eval(input) + if (value == null) { + null + } else { + val checksum = new CRC32 + checksum.update(value.asInstanceOf[Array[Byte]], 0, value.asInstanceOf[Array[Byte]].length) + checksum.getValue + } + } + +} + /** * A 
function that calculates the SHA-2 family of functions (SHA-224, SHA-256, SHA-384, and SHA-512) * and returns it as a hex string. The first argument is the string or binary to be hashed. The @@ -139,4 +192,6 @@ case class Sha2(left: Expression, right: Expression) } """ } + } + diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/MiscFunctionsSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/MiscFunctionsSuite.scala index 38482c54c61db..242ba244d01b4 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/MiscFunctionsSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/MiscFunctionsSuite.scala @@ -31,6 +31,20 @@ class MiscFunctionsSuite extends SparkFunSuite with ExpressionEvalHelper { checkEvaluation(Md5(Literal.create(null, BinaryType)), null) } + test("sha") { + checkEvaluation(Sha(Literal("ABC".getBytes)), "3c01bdbb26f358bab27f267924aa2c9a03fcfdb8") + checkEvaluation(Sha(Literal.create(Array[Byte](1, 2, 3, 4, 5, 6), BinaryType)), + "5d211bad8f4ee70e16c7d343a838fc344a1ed961") + checkEvaluation(Sha(Literal.create(null, BinaryType)), null) + } + + test("crc32") { + checkEvaluation(Crc32(Literal("ABC".getBytes)), 2743272264l) + checkEvaluation(Crc32(Literal.create(Array[Byte](1, 2, 3, 4, 5, 6), BinaryType)), + 2180413220l) + checkEvaluation(Crc32(Literal.create(null, BinaryType)), null) + } + test("sha2") { checkEvaluation(Sha2(Literal("ABC".getBytes), Literal(256)), DigestUtils.sha256Hex("ABC")) checkEvaluation(Sha2(Literal.create(Array[Byte](1, 2, 3, 4, 5, 6), BinaryType), Literal(384)), @@ -41,4 +55,5 @@ class MiscFunctionsSuite extends SparkFunSuite with ExpressionEvalHelper { checkEvaluation(Sha2(Literal("ABC".getBytes), Literal.create(null, IntegerType)), null) checkEvaluation(Sha2(Literal.create(null, BinaryType), Literal.create(null, IntegerType)), null) } + } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/functions.scala b/sql/core/src/main/scala/org/apache/spark/sql/functions.scala index 355ce0e3423cf..f18e5c6432ed1 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/functions.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/functions.scala @@ -1414,6 +1414,38 @@ object functions { */ def md5(columnName: String): Column = md5(Column(columnName)) + /* + * Calculates the SHA digest and returns the value as a hex string. + * + * @group misc_funcs + * @since 1.5.0 + */ + def sha(e: Column): Column = Sha(e.expr) + + /** + * Calculates the SHA digest and returns the value as a hex string. + * + * @group misc_funcs + * @since 1.5.0 + */ + def sha(columnName: String): Column = sha(Column(columnName)) + + /** + * Calculates the cyclic redundancy check value and returns the value as a bigint. + * + * @group misc_funcs + * @since 1.5.0 + */ + def crc32(e: Column): Column = Crc32(e.expr) + + /** + * Calculates the cyclic redundancy check value and returns the value as a bigint. + * + * @group misc_funcs + * @since 1.5.0 + */ + def crc32(columnName: String): Column = crc32(Column(columnName)) + /** * Calculates the SHA-2 family of hash functions and returns the value as a hex string. 
* diff --git a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameFunctionsSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameFunctionsSuite.scala index 8baed57a7f129..3d157c259d3c9 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameFunctionsSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameFunctionsSuite.scala @@ -133,6 +133,7 @@ class DataFrameFunctionsSuite extends QueryTest { Row("x", "y", null)) } + test("misc md5 function") { val df = Seq(("ABC", Array[Byte](1, 2, 3, 4, 5, 6))).toDF("a", "b") checkAnswer( @@ -144,6 +145,30 @@ class DataFrameFunctionsSuite extends QueryTest { Row("902fbdd2b1df0c4f70b4a5d23525e932", "6ac1e56bc78f031059be7be854522c4c")) } + test("misc sha function") { + val df = Seq(("ABC", Array[Byte](1, 2, 3, 4, 5, 6))).toDF("a", "b") + checkAnswer( + df.select(sha($"a"), sha("b")), + Row("3c01bdbb26f358bab27f267924aa2c9a03fcfdb8", + "5d211bad8f4ee70e16c7d343a838fc344a1ed961")) + + checkAnswer( + df.selectExpr("sha(a)", "sha(b)"), + Row("3c01bdbb26f358bab27f267924aa2c9a03fcfdb8", + "5d211bad8f4ee70e16c7d343a838fc344a1ed961")) + } + + test("misc crc32 function") { + val df = Seq(("ABC", Array[Byte](1, 2, 3, 4, 5, 6))).toDF("a", "b") + checkAnswer( + df.select(crc32($"a"), crc32("b")), + Row(2743272264l, 2180413220l)) + + checkAnswer( + df.selectExpr("crc32(a)", "crc32(b)"), + Row(2743272264l, 2180413220l)) + } + test("misc sha2 function") { val df = Seq(("ABC", Array[Byte](1, 2, 3, 4, 5, 6))).toDF("a", "b") checkAnswer( From 263bfed033712bc5985a0377b069d68eafd784e8 Mon Sep 17 00:00:00 2001 From: Shilei Date: Tue, 23 Jun 2015 13:59:00 +0800 Subject: [PATCH 2/7] Add sha1(alias for sha) --- .../org/apache/spark/sql/functions.scala | 50 +++++++++++++------ .../spark/sql/DataFrameFunctionsSuite.scala | 10 ++++ 2 files changed, 44 insertions(+), 16 deletions(-) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/functions.scala b/sql/core/src/main/scala/org/apache/spark/sql/functions.scala index f18e5c6432ed1..267560491f5c2 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/functions.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/functions.scala @@ -1414,7 +1414,7 @@ object functions { */ def md5(columnName: String): Column = md5(Column(columnName)) - /* + /** * Calculates the SHA digest and returns the value as a hex string. * * @group misc_funcs @@ -1423,27 +1423,45 @@ object functions { def sha(e: Column): Column = Sha(e.expr) /** - * Calculates the SHA digest and returns the value as a hex string. - * - * @group misc_funcs - * @since 1.5.0 - */ + * Calculates the SHA digest and returns the value as a hex string. + * + * @group misc_funcs + * @since 1.5.0 + */ def sha(columnName: String): Column = sha(Column(columnName)) /** - * Calculates the cyclic redundancy check value and returns the value as a bigint. - * - * @group misc_funcs - * @since 1.5.0 - */ + * Calculates the SHA digest and returns the value as a hex string. + * Alias for sha. + * + * @group misc_funcs + * @since 1.5.0 + */ + def sha1(e: Column): Column = Sha(e.expr) + + /** + * Calculates the SHA digest and returns the value as a hex string. + * Alias for sha. + * + * @group misc_funcs + * @since 1.5.0 + */ + def sha1(columnName: String): Column = sha1(Column(columnName)) + + /** + * Calculates the cyclic redundancy check value and returns the value as a bigint. 
+ * + * @group misc_funcs + * @since 1.5.0 + */ def crc32(e: Column): Column = Crc32(e.expr) /** - * Calculates the cyclic redundancy check value and returns the value as a bigint. - * - * @group misc_funcs - * @since 1.5.0 - */ + * Calculates the cyclic redundancy check value and returns the value as a bigint. + * + * @group misc_funcs + * @since 1.5.0 + */ def crc32(columnName: String): Column = crc32(Column(columnName)) /** diff --git a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameFunctionsSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameFunctionsSuite.scala index 3d157c259d3c9..2ca48749b39a5 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameFunctionsSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameFunctionsSuite.scala @@ -156,6 +156,16 @@ class DataFrameFunctionsSuite extends QueryTest { df.selectExpr("sha(a)", "sha(b)"), Row("3c01bdbb26f358bab27f267924aa2c9a03fcfdb8", "5d211bad8f4ee70e16c7d343a838fc344a1ed961")) + + checkAnswer( + df.select(sha1($"a"), sha1("b")), + Row("3c01bdbb26f358bab27f267924aa2c9a03fcfdb8", + "5d211bad8f4ee70e16c7d343a838fc344a1ed961")) + + checkAnswer( + df.selectExpr("sha1(a)", "sha1(b)"), + Row("3c01bdbb26f358bab27f267924aa2c9a03fcfdb8", + "5d211bad8f4ee70e16c7d343a838fc344a1ed961")) } test("misc crc32 function") { From 735fcce5ab2d3bd543d09449f5a3afdddf9716a8 Mon Sep 17 00:00:00 2001 From: Shilei Date: Wed, 24 Jun 2015 10:45:08 +0800 Subject: [PATCH 3/7] Apply sha to StringType --- .../spark/sql/catalyst/expressions/misc.scala | 21 +++++++++++++++---- .../expressions/MiscFunctionsSuite.scala | 1 + 2 files changed, 18 insertions(+), 4 deletions(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/misc.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/misc.scala index 98eb3a3cbd977..6138c1daa576a 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/misc.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/misc.scala @@ -61,21 +61,34 @@ case class Sha(child: Expression) override def dataType: DataType = StringType - override def expectedChildTypes: Seq[DataType] = Seq(BinaryType) + override def expectedChildTypes: Seq[DataType] = Seq(BinaryType, StringType) override def eval(input: InternalRow): Any = { val value = child.eval(input) if (value == null) { null } else { - UTF8String.fromString(DigestUtils.shaHex(value.asInstanceOf[Array[Byte]])) + value match { + case b: Array[Byte] => + UTF8String.fromString(DigestUtils.shaHex(value.asInstanceOf[Array[Byte]])) + case s: UTF8String => + UTF8String.fromString(DigestUtils.shaHex(s.getBytes)) + } + } } override def genCode(ctx: CodeGenContext, ev: GeneratedExpressionCode): String = { defineCodeGen(ctx, ev, c => - "org.apache.spark.unsafe.types.UTF8String.fromString(" + - s"org.apache.commons.codec.digest.DigestUtils.shaHex($c))") + child.dataType match { + case BinaryType => + "org.apache.spark.unsafe.types.UTF8String.fromString(" + + s"org.apache.commons.codec.digest.DigestUtils.shaHex($c))" + case StringType => + "org.apache.spark.unsafe.types.UTF8String.fromString(" + + s"org.apache.commons.codec.digest.DigestUtils.shaHex($c.getBytes()))" + } + ) } } diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/MiscFunctionsSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/MiscFunctionsSuite.scala index 242ba244d01b4..fde945fac1b79 100644 --- 
a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/MiscFunctionsSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/MiscFunctionsSuite.scala @@ -32,6 +32,7 @@ class MiscFunctionsSuite extends SparkFunSuite with ExpressionEvalHelper { } test("sha") { + checkEvaluation(Sha(Literal("ABC")), "3c01bdbb26f358bab27f267924aa2c9a03fcfdb8") checkEvaluation(Sha(Literal("ABC".getBytes)), "3c01bdbb26f358bab27f267924aa2c9a03fcfdb8") checkEvaluation(Sha(Literal.create(Array[Byte](1, 2, 3, 4, 5, 6), BinaryType)), "5d211bad8f4ee70e16c7d343a838fc344a1ed961") From c89ec137e22fe4a2fc782550443b12d397c9b914 Mon Sep 17 00:00:00 2001 From: Shilei Date: Wed, 24 Jun 2015 10:51:42 +0800 Subject: [PATCH 4/7] Remove blanks --- .../scala/org/apache/spark/sql/catalyst/expressions/misc.scala | 1 - .../scala/org/apache/spark/sql/DataFrameFunctionsSuite.scala | 1 - 2 files changed, 2 deletions(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/misc.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/misc.scala index 6138c1daa576a..0af0fad5e1bbc 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/misc.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/misc.scala @@ -74,7 +74,6 @@ case class Sha(child: Expression) case s: UTF8String => UTF8String.fromString(DigestUtils.shaHex(s.getBytes)) } - } } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameFunctionsSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameFunctionsSuite.scala index 2ca48749b39a5..764b7ba6f7865 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameFunctionsSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameFunctionsSuite.scala @@ -133,7 +133,6 @@ class DataFrameFunctionsSuite extends QueryTest { Row("x", "y", null)) } - test("misc md5 function") { val df = Seq(("ABC", Array[Byte](1, 2, 3, 4, 5, 6))).toDF("a", "b") checkAnswer( From a698c04d6234d16f6b51813830dfe42263cdfc41 Mon Sep 17 00:00:00 2001 From: Shilei Date: Wed, 24 Jun 2015 15:28:25 +0800 Subject: [PATCH 5/7] Remove StringType for sha --- .../spark/sql/catalyst/expressions/misc.scala | 19 ++++--------------- .../expressions/MiscFunctionsSuite.scala | 1 - 2 files changed, 4 insertions(+), 16 deletions(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/misc.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/misc.scala index 0af0fad5e1bbc..0484fc9a582b1 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/misc.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/misc.scala @@ -61,32 +61,21 @@ case class Sha(child: Expression) override def dataType: DataType = StringType - override def expectedChildTypes: Seq[DataType] = Seq(BinaryType, StringType) + override def expectedChildTypes: Seq[DataType] = Seq(BinaryType) override def eval(input: InternalRow): Any = { val value = child.eval(input) if (value == null) { null } else { - value match { - case b: Array[Byte] => - UTF8String.fromString(DigestUtils.shaHex(value.asInstanceOf[Array[Byte]])) - case s: UTF8String => - UTF8String.fromString(DigestUtils.shaHex(s.getBytes)) - } + UTF8String.fromString(DigestUtils.shaHex(value.asInstanceOf[Array[Byte]])) } } override def genCode(ctx: CodeGenContext, ev: GeneratedExpressionCode): String = { defineCodeGen(ctx, ev, c => - child.dataType match { - case 
BinaryType => - "org.apache.spark.unsafe.types.UTF8String.fromString(" + - s"org.apache.commons.codec.digest.DigestUtils.shaHex($c))" - case StringType => - "org.apache.spark.unsafe.types.UTF8String.fromString(" + - s"org.apache.commons.codec.digest.DigestUtils.shaHex($c.getBytes()))" - } + "org.apache.spark.unsafe.types.UTF8String.fromString(" + + s"org.apache.commons.codec.digest.DigestUtils.shaHex($c))" ) } diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/MiscFunctionsSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/MiscFunctionsSuite.scala index fde945fac1b79..242ba244d01b4 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/MiscFunctionsSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/MiscFunctionsSuite.scala @@ -32,7 +32,6 @@ class MiscFunctionsSuite extends SparkFunSuite with ExpressionEvalHelper { } test("sha") { - checkEvaluation(Sha(Literal("ABC")), "3c01bdbb26f358bab27f267924aa2c9a03fcfdb8") checkEvaluation(Sha(Literal("ABC".getBytes)), "3c01bdbb26f358bab27f267924aa2c9a03fcfdb8") checkEvaluation(Sha(Literal.create(Array[Byte](1, 2, 3, 4, 5, 6), BinaryType)), "5d211bad8f4ee70e16c7d343a838fc344a1ed961") From 37d1c424ca2cb4b7fecc15f7eff813f879b2a376 Mon Sep 17 00:00:00 2001 From: Shilei Date: Thu, 25 Jun 2015 09:15:45 +0800 Subject: [PATCH 6/7] Add codegen for crc32 --- .../catalyst/analysis/FunctionRegistry.scala | 4 ++-- .../spark/sql/catalyst/expressions/misc.scala | 17 ++++++++++++++++- .../expressions/MiscFunctionsSuite.scala | 10 +++++----- .../scala/org/apache/spark/sql/functions.scala | 4 ++-- .../spark/sql/DataFrameFunctionsSuite.scala | 4 ++-- 5 files changed, 27 insertions(+), 12 deletions(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/FunctionRegistry.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/FunctionRegistry.scala index 08eaabcab8405..e6f238ea765bd 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/FunctionRegistry.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/FunctionRegistry.scala @@ -135,8 +135,8 @@ object FunctionRegistry { // misc functions expression[Md5]("md5"), - expression[Sha]("sha"), - expression[Sha]("sha1"), + expression[Sha1]("sha"), + expression[Sha1]("sha1"), expression[Crc32]("crc32"), expression[Sha2]("sha2"), diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/misc.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/misc.scala index 0484fc9a582b1..73eb3d6a9ca13 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/misc.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/misc.scala @@ -56,7 +56,7 @@ case class Md5(child: Expression) * A function that calculates an SHA-1 digest and returns it as a hex string * For input of type [[BinaryType]] */ -case class Sha(child: Expression) +case class Sha1(child: Expression) extends UnaryExpression with ExpectsInputTypes { override def dataType: DataType = StringType @@ -103,6 +103,21 @@ case class Crc32(child: Expression) } } + override def genCode(ctx: CodeGenContext, ev: GeneratedExpressionCode): String = { + val value = child.gen(ctx) + val CRC32 = "java.util.zip.CRC32" + s""" + ${value.code} + boolean ${ev.isNull} = ${value.isNull}; + ${ctx.javaType(dataType)} ${ev.primitive} = ${ctx.defaultValue(dataType)}; + if (!${ev.isNull}) 
{ + ${CRC32} checksum = new ${CRC32}(); + checksum.update(${value.primitive}, 0, ${value.primitive}.length); + ${ev.primitive} = checksum.getValue(); + } + """ + } + } /** diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/MiscFunctionsSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/MiscFunctionsSuite.scala index 242ba244d01b4..878e08c50f209 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/MiscFunctionsSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/MiscFunctionsSuite.scala @@ -32,16 +32,16 @@ class MiscFunctionsSuite extends SparkFunSuite with ExpressionEvalHelper { } test("sha") { - checkEvaluation(Sha(Literal("ABC".getBytes)), "3c01bdbb26f358bab27f267924aa2c9a03fcfdb8") - checkEvaluation(Sha(Literal.create(Array[Byte](1, 2, 3, 4, 5, 6), BinaryType)), + checkEvaluation(Sha1(Literal("ABC".getBytes)), "3c01bdbb26f358bab27f267924aa2c9a03fcfdb8") + checkEvaluation(Sha1(Literal.create(Array[Byte](1, 2, 3, 4, 5, 6), BinaryType)), "5d211bad8f4ee70e16c7d343a838fc344a1ed961") - checkEvaluation(Sha(Literal.create(null, BinaryType)), null) + checkEvaluation(Sha1(Literal.create(null, BinaryType)), null) } test("crc32") { - checkEvaluation(Crc32(Literal("ABC".getBytes)), 2743272264l) + checkEvaluation(Crc32(Literal("ABC".getBytes)), 2743272264L) checkEvaluation(Crc32(Literal.create(Array[Byte](1, 2, 3, 4, 5, 6), BinaryType)), - 2180413220l) + 2180413220L) checkEvaluation(Crc32(Literal.create(null, BinaryType)), null) } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/functions.scala b/sql/core/src/main/scala/org/apache/spark/sql/functions.scala index 267560491f5c2..2a63871dba33f 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/functions.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/functions.scala @@ -1420,7 +1420,7 @@ object functions { * @group misc_funcs * @since 1.5.0 */ - def sha(e: Column): Column = Sha(e.expr) + def sha(e: Column): Column = Sha1(e.expr) /** * Calculates the SHA digest and returns the value as a hex string. @@ -1437,7 +1437,7 @@ object functions { * @group misc_funcs * @since 1.5.0 */ - def sha1(e: Column): Column = Sha(e.expr) + def sha1(e: Column): Column = Sha1(e.expr) /** * Calculates the SHA digest and returns the value as a hex string. 
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameFunctionsSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameFunctionsSuite.scala index 764b7ba6f7865..c4d424129aecc 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameFunctionsSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameFunctionsSuite.scala @@ -171,11 +171,11 @@ class DataFrameFunctionsSuite extends QueryTest { val df = Seq(("ABC", Array[Byte](1, 2, 3, 4, 5, 6))).toDF("a", "b") checkAnswer( df.select(crc32($"a"), crc32("b")), - Row(2743272264l, 2180413220l)) + Row(2743272264L, 2180413220L)) checkAnswer( df.selectExpr("crc32(a)", "crc32(b)"), - Row(2743272264l, 2180413220l)) + Row(2743272264L, 2180413220L)) } test("misc sha2 function") { From 271cb52c988da922dfdb7acf5138e68c9e78861c Mon Sep 17 00:00:00 2001 From: Shilei Date: Mon, 29 Jun 2015 14:06:13 +0800 Subject: [PATCH 7/7] Optimize codegen --- .../scala/org/apache/spark/sql/catalyst/expressions/misc.scala | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/misc.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/misc.scala index 73eb3d6a9ca13..8243938f8a608 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/misc.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/misc.scala @@ -109,7 +109,7 @@ case class Crc32(child: Expression) s""" ${value.code} boolean ${ev.isNull} = ${value.isNull}; - ${ctx.javaType(dataType)} ${ev.primitive} = ${ctx.defaultValue(dataType)}; + long ${ev.primitive} = ${ctx.defaultValue(dataType)}; if (!${ev.isNull}) { ${CRC32} checksum = new ${CRC32}(); checksum.update(${value.primitive}, 0, ${value.primitive}.length);
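
For reference, a minimal usage sketch of the functions this series adds, reusing the fixture and expected values from DataFrameFunctionsSuite above. It assumes a SQLContext whose implicits._ are in scope (for toDF and the $ column syntax); that setup comes from the test harness and is not part of this patch.

    import org.apache.spark.sql.functions.{crc32, sha, sha1}

    // Same two-column fixture as the test suite: a string column and a binary column.
    val df = Seq(("ABC", Array[Byte](1, 2, 3, 4, 5, 6))).toDF("a", "b")

    // DataFrame API: each function accepts either a Column or a column-name String.
    df.select(sha1($"a"), crc32("b")).collect()
    // => Array(Row("3c01bdbb26f358bab27f267924aa2c9a03fcfdb8", 2180413220L))

    // SQL expression form: FunctionRegistry registers both "sha" and "sha1" for Sha1,
    // and the string column is coerced to binary before hashing, as in the tests.
    df.selectExpr("sha(a)", "sha1(a)", "crc32(a)").collect()
    // => Array(Row("3c01bdbb26f358bab27f267924aa2c9a03fcfdb8",
    //              "3c01bdbb26f358bab27f267924aa2c9a03fcfdb8", 2743272264L))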