-
Notifications
You must be signed in to change notification settings - Fork 28.1k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
[SPARK-8784] [SQL] Add Python API for hex and unhex #7181
Closed
Closed
Changes from 2 commits
Commits
Show all changes
6 commits
Select commit
Hold shift + click to select a range
1a24082
Add Python API for hex and unhex
c3af78c
address comments
25156b7
address comments and fix test
b31fc9a
Update math.scala
49e325f
Merge branch 'master' of github.com:apache/spark into hex
f032fbb
Merge branch 'hex' of github.com:davies/spark into hex
File filter
Filter by extension
Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -259,30 +259,22 @@ case class Hex(child: Expression) extends UnaryExpression with Serializable { | |
case LongType => hex(num.asInstanceOf[Long]) | ||
case IntegerType => hex(num.asInstanceOf[Integer].toLong) | ||
case BinaryType => hex(num.asInstanceOf[Array[Byte]]) | ||
case StringType => hex(num.asInstanceOf[UTF8String]) | ||
case StringType => hex(num.asInstanceOf[UTF8String].getBytes) | ||
} | ||
} | ||
} | ||
|
||
/** | ||
* Converts every character in s to two hex digits. | ||
*/ | ||
private def hex(str: UTF8String): UTF8String = { | ||
hex(str.getBytes) | ||
} | ||
private[this] val hexDigits = Array[Char]( | ||
'0', '1', '2', '3', '4', '5', '6', '7', '8', '9', 'A', 'B', 'C', 'D', 'E', 'F' | ||
).map(_.toByte) | ||
|
||
private def hex(bytes: Array[Byte]): UTF8String = { | ||
doHex(bytes, bytes.length) | ||
} | ||
|
||
private def doHex(bytes: Array[Byte], length: Int): UTF8String = { | ||
private[this] def hex(bytes: Array[Byte]): UTF8String = { | ||
val length = bytes.length | ||
val value = new Array[Byte](length * 2) | ||
var i = 0 | ||
while (i < length) { | ||
value(i * 2) = Character.toUpperCase(Character.forDigit( | ||
(bytes(i) & 0xF0) >>> 4, 16)).toByte | ||
value(i * 2 + 1) = Character.toUpperCase(Character.forDigit( | ||
bytes(i) & 0x0F, 16)).toByte | ||
value(i * 2) = hexDigits((bytes(i) & 0xF0) >> 4) | ||
value(i * 2 + 1) = hexDigits((bytes(i) & 0x0F)) | ||
i += 1 | ||
} | ||
UTF8String.fromBytes(value) | ||
|
@@ -303,6 +295,66 @@ case class Hex(child: Expression) extends UnaryExpression with Serializable { | |
} | ||
} | ||
|
||
/**
 * Performs the inverse operation of HEX.
 * Resulting characters are returned as a byte array.
 *
 * Every pair of hexadecimal characters in the input string becomes one output byte. An input
 * with an odd number of characters is decoded as if it were left-padded with a single '0'.
 * Evaluates to null when the input is null or contains a character that is not a
 * hexadecimal digit.
 */
case class Unhex(child: Expression)
  extends UnaryExpression with ExpectsInputTypes with Serializable {

  override def nullable: Boolean = true
  override def dataType: DataType = BinaryType
  // eval() casts the child's value to UTF8String, so the declared input type has to be
  // StringType; declaring BinaryType would make that cast fail for coerced inputs.
  override def inputTypes: Seq[DataType] = Seq(StringType)

  override def eval(input: InternalRow): Any = {
    val num = child.eval(input)
    if (num == null) {
      null
    } else {
      unhex(num.asInstanceOf[UTF8String].getBytes)
    }
  }

  // lookup table to translate '0' -> 0 ... 'F'/'f' -> 15; every other entry stays -1
  private[this] val unhexDigits = {
    val array = Array.fill[Byte](128)(-1)
    (0 to 9).foreach(i => array('0' + i) = i.toByte)
    (0 to 5).foreach(i => array('A' + i) = (i + 10).toByte)
    (0 to 5).foreach(i => array('a' + i) = (i + 10).toByte)
    array
  }

  /**
   * Decodes hexadecimal characters into bytes.
   *
   * @param bytes the UTF-8 encoded bytes of the hexadecimal string
   * @return the decoded bytes, or null if any input byte is not a hexadecimal digit
   */
  private[this] def unhex(bytes: Array[Byte]): Array[Byte] = {
    val out = new Array[Byte]((bytes.length + 1) >> 1)
    var i = 0
    // Becomes 1 when the leading character fills out(0) on its own, so that the pairs
    // that follow are written to out(1), out(2), ... instead of overwriting out(0).
    var oddShift = 0
    if ((bytes.length & 0x01) != 0) {
      // padding with '0'
      // A negative byte is part of a non-ASCII UTF-8 sequence and would index outside
      // the 128-entry lookup table, so it is rejected before the lookup.
      if (bytes(0) < 0) {
        return null
      }
      val v = unhexDigits(bytes(0))
      if (v == -1) {
        return null
      }
      out(0) = v
      i += 1
      oddShift = 1
    }
    // two characters form the hex value.
    while (i < bytes.length) {
      // Same non-ASCII guard as above: negative bytes cannot be used as table indices.
      if (bytes(i) < 0 || bytes(i + 1) < 0) {
        return null
      }
      val first = unhexDigits(bytes(i))
      val second = unhexDigits(bytes(i + 1))
      if (first == -1 || second == -1) {
        return null
      }
      out((i + oddShift) / 2) = (((first << 4) | second) & 0xFF).toByte
      i += 2
    }
    out
  }
}
|
||
//////////////////////////////////////////////////////////////////////////////////////////////////// | ||
//////////////////////////////////////////////////////////////////////////////////////////////////// | ||
|
@@ -351,58 +403,6 @@ case class Pow(left: Expression, right: Expression) | |
} | ||
} | ||
|
||
/** | ||
* Performs the inverse operation of HEX. | ||
* Resulting characters are returned as a byte array. | ||
*/ | ||
case class UnHex(child: Expression) extends UnaryExpression with Serializable { | ||
|
||
override def dataType: DataType = BinaryType | ||
|
||
override def checkInputDataTypes(): TypeCheckResult = { | ||
if (child.dataType.isInstanceOf[StringType] || child.dataType == NullType) { | ||
TypeCheckResult.TypeCheckSuccess | ||
} else { | ||
TypeCheckResult.TypeCheckFailure(s"unHex accepts String type, not ${child.dataType}") | ||
} | ||
} | ||
|
||
override def eval(input: InternalRow): Any = { | ||
val num = child.eval(input) | ||
if (num == null) { | ||
null | ||
} else { | ||
unhex(num.asInstanceOf[UTF8String].getBytes) | ||
} | ||
} | ||
|
||
private val unhexDigits = { | ||
val array = Array.fill[Byte](128)(-1) | ||
(0 to 9).foreach(i => array('0' + i) = i.toByte) | ||
(0 to 5).foreach(i => array('A' + i) = (i + 10).toByte) | ||
(0 to 5).foreach(i => array('a' + i) = (i + 10).toByte) | ||
array | ||
} | ||
|
||
private def unhex(inputBytes: Array[Byte]): Array[Byte] = { | ||
var bytes = inputBytes | ||
if ((bytes.length & 0x01) != 0) { | ||
bytes = '0'.toByte +: bytes | ||
} | ||
val out = new Array[Byte](bytes.length >> 1) | ||
// two characters form the hex value. | ||
var i = 0 | ||
while (i < bytes.length) { | ||
val first = unhexDigits(bytes(i)) | ||
val second = unhexDigits(bytes(i + 1)) | ||
if (first == -1 || second == -1) { return null} | ||
out(i / 2) = (((first << 4) | second) & 0xFF).toByte | ||
i += 2 | ||
} | ||
out | ||
} | ||
} | ||
|
||
case class Hypot(left: Expression, right: Expression) | ||
extends BinaryMathExpression(math.hypot, "HYPOT") | ||
|
||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -19,7 +19,7 @@ package org.apache.spark.sql.catalyst.expressions | |
|
||
import org.apache.spark.SparkFunSuite | ||
import org.apache.spark.sql.catalyst.dsl.expressions._ | ||
import org.apache.spark.sql.types.{DataType, DoubleType, LongType} | ||
import org.apache.spark.sql.types._ | ||
|
||
class MathFunctionsSuite extends SparkFunSuite with ExpressionEvalHelper { | ||
|
||
|
@@ -226,11 +226,15 @@ class MathFunctionsSuite extends SparkFunSuite with ExpressionEvalHelper { | |
} | ||
|
||
test("hex") { | ||
checkEvaluation(Hex(Literal.create(null, IntegerType)), null) | ||
checkEvaluation(Hex(Literal(28)), "1C") | ||
checkEvaluation(Hex(Literal(-28)), "FFFFFFFFFFFFFFE4") | ||
checkEvaluation(Hex(Literal.create(null, LongType)), null) | ||
checkEvaluation(Hex(Literal(100800200404L)), "177828FED4") | ||
checkEvaluation(Hex(Literal(-100800200404L)), "FFFFFFE887D7012C") | ||
checkEvaluation(Hex(Literal.create(null, StringType)), null) | ||
checkEvaluation(Hex(Literal("helloHex")), "68656C6C6F486578") | ||
checkEvaluation(Hex(Literal.create(null, BinaryType)), null) | ||
checkEvaluation(Hex(Literal("helloHex".getBytes())), "68656C6C6F486578") | ||
// scalastyle:off | ||
// Turn off scala style for non-ascii chars | ||
|
@@ -239,9 +243,15 @@ class MathFunctionsSuite extends SparkFunSuite with ExpressionEvalHelper { | |
} | ||
|
||
test("unhex") { | ||
checkEvaluation(UnHex(Literal("737472696E67")), "string".getBytes) | ||
checkEvaluation(UnHex(Literal("")), new Array[Byte](0)) | ||
checkEvaluation(UnHex(Literal("0")), Array[Byte](0)) | ||
checkEvaluation(Unhex(Literal.create(null, StringType)), null) | ||
checkEvaluation(Unhex(Literal("737472696E67")), "string".getBytes) | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Add a test for a null literal of string type. |
||
checkEvaluation(Unhex(Literal("")), new Array[Byte](0)) | ||
checkEvaluation(Unhex(Literal("F")), Array[Byte](15)) | ||
checkEvaluation(Unhex(Literal("ff")), Array[Byte](-1)) | ||
// scalastyle:off | ||
// Turn off scala style for non-ascii chars | ||
checkEvaluation(Unhex(Literal("E4B889E9878DE79A84")), "三重的".getBytes("UTF-8")) | ||
// scalastyle:on | ||
} | ||
|
||
test("hypot") { | ||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Add this suggestion to a batch that can be applied as a single commit.
This suggestion is invalid because no changes were made to the code.
Suggestions cannot be applied while the pull request is closed.
Suggestions cannot be applied while viewing a subset of changes.
Only one suggestion per line can be applied in a batch.
Add this suggestion to a batch that can be applied as a single commit.
Applying suggestions on deleted lines is not supported.
You must change the existing code in this line in order to create a valid suggestion.
Outdated suggestions cannot be applied.
This suggestion has been applied or marked resolved.
Suggestions cannot be applied from pending reviews.
Suggestions cannot be applied on multi-line comments.
Suggestions cannot be applied while the pull request is queued to merge.
Suggestion cannot be applied right now. Please check back later.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
can we move the two tables into some static field in an object?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
We can, but putting them here is clearer. There is only one object per Expression, so I think it's fine.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
why would it be more clear? are you thinking about the distance between its definition and where it is used?
this is one case where java beats scala with static fields. Ideally all the string functions should just be member functions in UTF8String.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Less code is better than more code. From a performance point of view, there is never an end to optimizing it. I think we could go with something that's good enough (that won't be the bottleneck).
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
how is this less code? It is just a bad idea to create unnecessary state. Just move both tables into two fields in Hex object.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
BTW I absolutely disagree that "less code is better than more code" as an ethos. While it can be true in many cases, there are plenty of counter examples:
In this case, I don't see how this creates less code (it takes 2 lines of code to define a scala object -- you can even put it in an existing java class like in UTF8String as a static field).