From 4e4f74b5e1267d1ada4a8f57b86aee0d9c17d90a Mon Sep 17 00:00:00 2001
From: Rosstin
Date: Wed, 1 Jul 2015 21:42:06 -0700
Subject: [PATCH 1/2] [SPARK-8660] [MLLIB] removed > symbols from comments in
 LogisticRegressionSuite.scala for ease of copypaste

'>' symbols removed from comments in LogisticRegressionSuite.scala, for ease of copypaste

also single-lined the multiline commands (is this desirable, or does it violate style?)

Author: Rosstin

Closes #7167 from Rosstin/SPARK-8660-2 and squashes the following commits:

f4b9bc8 [Rosstin] SPARK-8660 restored character limit on multiline comments in LogisticRegressionSuite.scala
fe6b112 [Rosstin] SPARK-8660 > symbols removed from LogisticRegressionSuite.scala for ease of copypaste
39ddd50 [Rosstin] Merge branch 'master' of github.com:apache/spark into SPARK-8661
5a05dee [Rosstin] SPARK-8661 for LinearRegressionSuite.scala, changed javadoc-style comments to regular multiline comments to make it easier to copy-paste the R code.
bb9a4b1 [Rosstin] Merge branch 'master' of github.com:apache/spark into SPARK-8660
242aedd [Rosstin] SPARK-8660, changed comment style from JavaDoc style to normal multiline comment in order to make copypaste into R easier, in file classification/LogisticRegressionSuite.scala
2cd2985 [Rosstin] Merge branch 'master' of github.com:apache/spark into SPARK-8639
21ac1e5 [Rosstin] Merge branch 'master' of github.com:apache/spark into SPARK-8639
6c18058 [Rosstin] fixed minor typos in docs/README.md and docs/api.md
---
 .../LogisticRegressionSuite.scala             | 117 ++++++++++--------
 1 file changed, 63 insertions(+), 54 deletions(-)

diff --git a/mllib/src/test/scala/org/apache/spark/ml/classification/LogisticRegressionSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/classification/LogisticRegressionSuite.scala
index bc6eeac1db5da..ba8fbee84197c 100644
--- a/mllib/src/test/scala/org/apache/spark/ml/classification/LogisticRegressionSuite.scala
+++ b/mllib/src/test/scala/org/apache/spark/ml/classification/LogisticRegressionSuite.scala
@@ -214,12 +214,13 @@ class LogisticRegressionSuite extends SparkFunSuite with MLlibTestSparkContext {
     /*
        Using the following R code to load the data and train the model using glmnet package.

-       > library("glmnet")
-       > data <- read.csv("path", header=FALSE)
-       > label = factor(data$V1)
-       > features = as.matrix(data.frame(data$V2, data$V3, data$V4, data$V5))
-       > weights = coef(glmnet(features,label, family="binomial", alpha = 0, lambda = 0))
-       > weights
+       library("glmnet")
+       data <- read.csv("path", header=FALSE)
+       label = factor(data$V1)
+       features = as.matrix(data.frame(data$V2, data$V3, data$V4, data$V5))
+       weights = coef(glmnet(features,label, family="binomial", alpha = 0, lambda = 0))
+       weights
+
        5 x 1 sparse Matrix of class "dgCMatrix"
                            s0
        (Intercept)  2.8366423
@@ -245,13 +246,14 @@ class LogisticRegressionSuite extends SparkFunSuite with MLlibTestSparkContext {
     /*
        Using the following R code to load the data and train the model using glmnet package.

-       > library("glmnet")
-       > data <- read.csv("path", header=FALSE)
-       > label = factor(data$V1)
-       > features = as.matrix(data.frame(data$V2, data$V3, data$V4, data$V5))
-       > weights =
+       library("glmnet")
+       data <- read.csv("path", header=FALSE)
+       label = factor(data$V1)
+       features = as.matrix(data.frame(data$V2, data$V3, data$V4, data$V5))
+       weights =
            coef(glmnet(features,label, family="binomial", alpha = 0, lambda = 0, intercept=FALSE))
-       > weights
+       weights
+
        5 x 1 sparse Matrix of class "dgCMatrix"
                            s0
        (Intercept)   .
@@ -278,12 +280,13 @@ class LogisticRegressionSuite extends SparkFunSuite with MLlibTestSparkContext {
     /*
        Using the following R code to load the data and train the model using glmnet package.

-       > library("glmnet")
-       > data <- read.csv("path", header=FALSE)
-       > label = factor(data$V1)
-       > features = as.matrix(data.frame(data$V2, data$V3, data$V4, data$V5))
-       > weights = coef(glmnet(features,label, family="binomial", alpha = 1, lambda = 0.12))
-       > weights
+       library("glmnet")
+       data <- read.csv("path", header=FALSE)
+       label = factor(data$V1)
+       features = as.matrix(data.frame(data$V2, data$V3, data$V4, data$V5))
+       weights = coef(glmnet(features,label, family="binomial", alpha = 1, lambda = 0.12))
+       weights
+
        5 x 1 sparse Matrix of class "dgCMatrix"
                            s0
        (Intercept) -0.05627428
@@ -310,13 +313,14 @@ class LogisticRegressionSuite extends SparkFunSuite with MLlibTestSparkContext {
     /*
        Using the following R code to load the data and train the model using glmnet package.

-       > library("glmnet")
-       > data <- read.csv("path", header=FALSE)
-       > label = factor(data$V1)
-       > features = as.matrix(data.frame(data$V2, data$V3, data$V4, data$V5))
-       > weights = coef(glmnet(features,label, family="binomial", alpha = 1, lambda = 0.12,
+       library("glmnet")
+       data <- read.csv("path", header=FALSE)
+       label = factor(data$V1)
+       features = as.matrix(data.frame(data$V2, data$V3, data$V4, data$V5))
+       weights = coef(glmnet(features,label, family="binomial", alpha = 1, lambda = 0.12,
            intercept=FALSE))
-       > weights
+       weights
+
        5 x 1 sparse Matrix of class "dgCMatrix"
                            s0
        (Intercept)   .
@@ -343,12 +347,13 @@ class LogisticRegressionSuite extends SparkFunSuite with MLlibTestSparkContext {
     /*
        Using the following R code to load the data and train the model using glmnet package.

-       > library("glmnet")
-       > data <- read.csv("path", header=FALSE)
-       > label = factor(data$V1)
-       > features = as.matrix(data.frame(data$V2, data$V3, data$V4, data$V5))
-       > weights = coef(glmnet(features,label, family="binomial", alpha = 0, lambda = 1.37))
-       > weights
+       library("glmnet")
+       data <- read.csv("path", header=FALSE)
+       label = factor(data$V1)
+       features = as.matrix(data.frame(data$V2, data$V3, data$V4, data$V5))
+       weights = coef(glmnet(features,label, family="binomial", alpha = 0, lambda = 1.37))
+       weights
+
        5 x 1 sparse Matrix of class "dgCMatrix"
                            s0
        (Intercept) 0.15021751
@@ -375,13 +380,14 @@ class LogisticRegressionSuite extends SparkFunSuite with MLlibTestSparkContext {
     /*
        Using the following R code to load the data and train the model using glmnet package.

-       > library("glmnet")
-       > data <- read.csv("path", header=FALSE)
-       > label = factor(data$V1)
-       > features = as.matrix(data.frame(data$V2, data$V3, data$V4, data$V5))
-       > weights = coef(glmnet(features,label, family="binomial", alpha = 0, lambda = 1.37,
+       library("glmnet")
+       data <- read.csv("path", header=FALSE)
+       label = factor(data$V1)
+       features = as.matrix(data.frame(data$V2, data$V3, data$V4, data$V5))
+       weights = coef(glmnet(features,label, family="binomial", alpha = 0, lambda = 1.37,
            intercept=FALSE))
-       > weights
+       weights
+
        5 x 1 sparse Matrix of class "dgCMatrix"
                            s0
        (Intercept)   .
@@ -408,12 +414,13 @@ class LogisticRegressionSuite extends SparkFunSuite with MLlibTestSparkContext {
     /*
        Using the following R code to load the data and train the model using glmnet package.

-       > library("glmnet")
-       > data <- read.csv("path", header=FALSE)
-       > label = factor(data$V1)
-       > features = as.matrix(data.frame(data$V2, data$V3, data$V4, data$V5))
-       > weights = coef(glmnet(features,label, family="binomial", alpha = 0.38, lambda = 0.21))
-       > weights
+       library("glmnet")
+       data <- read.csv("path", header=FALSE)
+       label = factor(data$V1)
+       features = as.matrix(data.frame(data$V2, data$V3, data$V4, data$V5))
+       weights = coef(glmnet(features,label, family="binomial", alpha = 0.38, lambda = 0.21))
+       weights
+
        5 x 1 sparse Matrix of class "dgCMatrix"
                            s0
        (Intercept) 0.57734851
@@ -440,13 +447,14 @@ class LogisticRegressionSuite extends SparkFunSuite with MLlibTestSparkContext {
     /*
        Using the following R code to load the data and train the model using glmnet package.

-       > library("glmnet")
-       > data <- read.csv("path", header=FALSE)
-       > label = factor(data$V1)
-       > features = as.matrix(data.frame(data$V2, data$V3, data$V4, data$V5))
-       > weights = coef(glmnet(features,label, family="binomial", alpha = 0.38, lambda = 0.21,
+       library("glmnet")
+       data <- read.csv("path", header=FALSE)
+       label = factor(data$V1)
+       features = as.matrix(data.frame(data$V2, data$V3, data$V4, data$V5))
+       weights = coef(glmnet(features,label, family="binomial", alpha = 0.38, lambda = 0.21,
            intercept=FALSE))
-       > weights
+       weights
+
        5 x 1 sparse Matrix of class "dgCMatrix"
                            s0
        (Intercept)   .
@@ -503,12 +511,13 @@ class LogisticRegressionSuite extends SparkFunSuite with MLlibTestSparkContext {
     /*
        Using the following R code to load the data and train the model using glmnet package.

-       > library("glmnet")
-       > data <- read.csv("path", header=FALSE)
-       > label = factor(data$V1)
-       > features = as.matrix(data.frame(data$V2, data$V3, data$V4, data$V5))
-       > weights = coef(glmnet(features,label, family="binomial", alpha = 1.0, lambda = 6.0))
-       > weights
+       library("glmnet")
+       data <- read.csv("path", header=FALSE)
+       label = factor(data$V1)
+       features = as.matrix(data.frame(data$V2, data$V3, data$V4, data$V5))
+       weights = coef(glmnet(features,label, family="binomial", alpha = 1.0, lambda = 6.0))
+       weights
+
        5 x 1 sparse Matrix of class "dgCMatrix"
                            s0
        (Intercept) -0.2480643

From b285ac5ba85fe0b32b00726ad7d3a2efb602e885 Mon Sep 17 00:00:00 2001
From: "zhichao.li"
Date: Wed, 1 Jul 2015 22:19:51 -0700
Subject: [PATCH 2/2] [SPARK-8227] [SQL] Add function unhex

cc chenghao-intel adrian-wang

Author: zhichao.li

Closes #7113 from zhichao-li/unhex and squashes the following commits:

379356e [zhichao.li] remove exception checking
a4ae6dc [zhichao.li] add udf_unhex to whitelist
fe5c14a [zhichao.li] add todigit
607d7a3 [zhichao.li] use checkInputTypes
bffd37f [zhichao.li] change to use Hex in apache common package
cde73f5 [zhichao.li] update to use AutoCastInputTypes
11945c7 [zhichao.li] style
c852d46 [zhichao.li] Add function unhex
---
 .../catalyst/analysis/FunctionRegistry.scala  |  1 +
 .../spark/sql/catalyst/expressions/math.scala | 52 +++++++++++++++++++
 .../expressions/MathFunctionsSuite.scala      |  6 +++
 .../org/apache/spark/sql/functions.scala      | 18 +++++++
 .../spark/sql/MathExpressionsSuite.scala      | 10 ++++
 .../execution/HiveCompatibilitySuite.scala    |  1 +
 6 files changed, 88 insertions(+)

diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/FunctionRegistry.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/FunctionRegistry.scala
index d53eaedda56b0..6f04298d4711b 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/FunctionRegistry.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/FunctionRegistry.scala
@@ -157,6 +157,7 @@ object FunctionRegistry {
     expression[Substring]("substr"),
     expression[Substring]("substring"),
     expression[Upper]("ucase"),
+    expression[UnHex]("unhex"),
     expression[Upper]("upper")
   )

diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/math.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/math.scala
index b51318dd5044c..8633eb06ffee4 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/math.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/math.scala
@@ -351,6 +351,58 @@ case class Pow(left: Expression, right: Expression)
   }
 }

+/**
+ * Performs the inverse operation of HEX.
+ * Resulting characters are returned as a byte array.
+ */
+case class UnHex(child: Expression) extends UnaryExpression with Serializable {
+
+  override def dataType: DataType = BinaryType
+
+  override def checkInputDataTypes(): TypeCheckResult = {
+    if (child.dataType.isInstanceOf[StringType] || child.dataType == NullType) {
+      TypeCheckResult.TypeCheckSuccess
+    } else {
+      TypeCheckResult.TypeCheckFailure(s"unHex accepts String type, not ${child.dataType}")
+    }
+  }
+
+  override def eval(input: InternalRow): Any = {
+    val num = child.eval(input)
+    if (num == null) {
+      null
+    } else {
+      unhex(num.asInstanceOf[UTF8String].getBytes)
+    }
+  }
+
+  private val unhexDigits = {
+    val array = Array.fill[Byte](128)(-1)
+    (0 to 9).foreach(i => array('0' + i) = i.toByte)
+    (0 to 5).foreach(i => array('A' + i) = (i + 10).toByte)
+    (0 to 5).foreach(i => array('a' + i) = (i + 10).toByte)
+    array
+  }
+
+  private def unhex(inputBytes: Array[Byte]): Array[Byte] = {
+    var bytes = inputBytes
+    if ((bytes.length & 0x01) != 0) {
+      bytes = '0'.toByte +: bytes
+    }
+    val out = new Array[Byte](bytes.length >> 1)
+    // two characters form the hex value.
+    var i = 0
+    while (i < bytes.length) {
+      val first = unhexDigits(bytes(i))
+      val second = unhexDigits(bytes(i + 1))
+      if (first == -1 || second == -1) { return null }
+      out(i / 2) = (((first << 4) | second) & 0xFF).toByte
+      i += 2
+    }
+    out
+  }
+}
+
 case class Hypot(left: Expression, right: Expression)
   extends BinaryMathExpression(math.hypot, "HYPOT")

diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/MathFunctionsSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/MathFunctionsSuite.scala
index b932d4ab850c7..b3345d7069159 100644
--- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/MathFunctionsSuite.scala
+++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/MathFunctionsSuite.scala
@@ -238,6 +238,12 @@ class MathFunctionsSuite extends SparkFunSuite with ExpressionEvalHelper {
     // scalastyle:on
   }

+  test("unhex") {
+    checkEvaluation(UnHex(Literal("737472696E67")), "string".getBytes)
+    checkEvaluation(UnHex(Literal("")), new Array[Byte](0))
+    checkEvaluation(UnHex(Literal("0")), Array[Byte](0))
+  }
+
   test("hypot") {
     testBinary(Hypot, math.hypot)
   }

diff --git a/sql/core/src/main/scala/org/apache/spark/sql/functions.scala b/sql/core/src/main/scala/org/apache/spark/sql/functions.scala
index 4e8f3f96bf4db..e6f623bdf39eb 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/functions.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/functions.scala
@@ -1053,6 +1053,24 @@ object functions {
    */
   def hex(colName: String): Column = hex(Column(colName))

+  /**
+   * Inverse of hex. Interprets each pair of characters as a hexadecimal number
+   * and converts to the byte representation of the number.
+   *
+   * @group math_funcs
+   * @since 1.5.0
+   */
+  def unhex(column: Column): Column = UnHex(column.expr)
+
+  /**
+   * Inverse of hex. Interprets each pair of characters as a hexadecimal number
+   * and converts to the byte representation of the number.
+   *
+   * @group math_funcs
+   * @since 1.5.0
+   */
+  def unhex(colName: String): Column = unhex(Column(colName))
+
   /**
    * Computes `sqrt(a^2^ + b^2^)` without intermediate overflow or underflow.
    *
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/MathExpressionsSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/MathExpressionsSuite.scala
index d6331aa4ff09e..c03cde38d75d0 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/MathExpressionsSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/MathExpressionsSuite.scala
@@ -225,6 +225,16 @@ class MathExpressionsSuite extends QueryTest {
     checkAnswer(data.selectExpr("hex(cast(d as binary))"), Seq(Row("68656C6C6F")))
   }

+  test("unhex") {
+    val data = Seq(("1C", "737472696E67")).toDF("a", "b")
+    checkAnswer(data.select(unhex('a)), Row(Array[Byte](28.toByte)))
+    checkAnswer(data.select(unhex('b)), Row("string".getBytes))
+    checkAnswer(data.selectExpr("unhex(a)"), Row(Array[Byte](28.toByte)))
+    checkAnswer(data.selectExpr("unhex(b)"), Row("string".getBytes))
+    checkAnswer(data.selectExpr("""unhex("##")"""), Row(null))
+    checkAnswer(data.selectExpr("""unhex("G123")"""), Row(null))
+  }
+
   test("hypot") {
     testTwoToOneMathFunction(hypot, hypot, math.hypot)
   }

diff --git a/sql/hive/compatibility/src/test/scala/org/apache/spark/sql/hive/execution/HiveCompatibilitySuite.scala b/sql/hive/compatibility/src/test/scala/org/apache/spark/sql/hive/execution/HiveCompatibilitySuite.scala
index f88e62763ca70..415a81644c58f 100644
--- a/sql/hive/compatibility/src/test/scala/org/apache/spark/sql/hive/execution/HiveCompatibilitySuite.scala
+++ b/sql/hive/compatibility/src/test/scala/org/apache/spark/sql/hive/execution/HiveCompatibilitySuite.scala
@@ -949,6 +949,7 @@ class HiveCompatibilitySuite extends HiveQueryFileTest with BeforeAndAfter {
     "udf_trim",
     "udf_ucase",
     "udf_unix_timestamp",
+    "udf_unhex",
     "udf_upper",
     "udf_var_pop",
     "udf_var_samp",
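The decoding logic added to math.scala above can be exercised on its own. Below is a minimal standalone Scala sketch of the same digit-table approach; the object name UnhexSketch, the unhex(String) signature, and the Option result are illustrative assumptions and not part of the patch (the UnHex expression itself returns null on invalid input).

// A standalone sketch of UnHex's decoding; names here are illustrative only.
object UnhexSketch {

  // ASCII code -> 4-bit value; -1 marks characters that are not hex digits,
  // mirroring the unhexDigits table added to math.scala.
  private val digits: Array[Byte] = {
    val a = Array.fill[Byte](128)(-1)
    (0 to 9).foreach(i => a('0' + i) = i.toByte)
    (0 to 5).foreach(i => a('A' + i) = (i + 10).toByte)
    (0 to 5).foreach(i => a('a' + i) = (i + 10).toByte)
    a
  }

  // Returns None on any non-hex character, where UnHex would return null.
  def unhex(s: String): Option[Array[Byte]] = {
    // An odd-length input is treated as if it had a leading '0', as in the patch.
    val padded = if (s.length % 2 != 0) "0" + s else s
    val out = new Array[Byte](padded.length / 2)
    var i = 0
    while (i < padded.length) {
      val c1 = padded.charAt(i)
      val c2 = padded.charAt(i + 1)
      if (c1 >= 128 || c2 >= 128) return None       // non-ASCII character
      val hi = digits(c1)
      val lo = digits(c2)
      if (hi == -1 || lo == -1) return None         // not a hex digit
      out(i / 2) = ((hi << 4) | lo).toByte          // two hex characters form one byte
      i += 2
    }
    Some(out)
  }

  def main(args: Array[String]): Unit = {
    println(unhex("737472696E67").map(b => new String(b)))  // Some(string)
    println(unhex("0").map(_.toList))                       // Some(List(0)): "0" is padded to "00"
    println(unhex("G123"))                                  // None: 'G' is not a hex digit
  }
}

The example inputs above come from the patch's own tests. Once the FunctionRegistry entry is in place, the same behavior is reachable from SQL, e.g. SELECT unhex('737472696E67'), and from the DataFrame API via the unhex functions added to functions.scala, as the new test suites demonstrate.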