From 4e4f74b5e1267d1ada4a8f57b86aee0d9c17d90a Mon Sep 17 00:00:00 2001
From: Rosstin
Date: Wed, 1 Jul 2015 21:42:06 -0700
Subject: [PATCH 1/2] [SPARK-8660] [MLLIB] removed > symbols from comments in
 LogisticRegressionSuite.scala for ease of copypaste

'>' symbols removed from comments in LogisticRegressionSuite.scala, for ease of copypaste

also single-lined the multiline commands (is this desirable, or does it violate style?)

Author: Rosstin

Closes #7167 from Rosstin/SPARK-8660-2 and squashes the following commits:

f4b9bc8 [Rosstin] SPARK-8660 restored character limit on multiline comments in LogisticRegressionSuite.scala
fe6b112 [Rosstin] SPARK-8660 > symbols removed from LogisticRegressionSuite.scala for ease of copypaste
39ddd50 [Rosstin] Merge branch 'master' of github.com:apache/spark into SPARK-8661
5a05dee [Rosstin] SPARK-8661 for LinearRegressionSuite.scala, changed javadoc-style comments to regular multiline comments to make it easier to copy-paste the R code.
bb9a4b1 [Rosstin] Merge branch 'master' of github.com:apache/spark into SPARK-8660
242aedd [Rosstin] SPARK-8660, changed comment style from JavaDoc style to normal multiline comment in order to make copypaste into R easier, in file classification/LogisticRegressionSuite.scala
2cd2985 [Rosstin] Merge branch 'master' of github.com:apache/spark into SPARK-8639
21ac1e5 [Rosstin] Merge branch 'master' of github.com:apache/spark into SPARK-8639
6c18058 [Rosstin] fixed minor typos in docs/README.md and docs/api.md
---
 .../LogisticRegressionSuite.scala             | 117 ++++++++++--------
 1 file changed, 63 insertions(+), 54 deletions(-)

diff --git a/mllib/src/test/scala/org/apache/spark/ml/classification/LogisticRegressionSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/classification/LogisticRegressionSuite.scala
index bc6eeac1db5da..ba8fbee84197c 100644
--- a/mllib/src/test/scala/org/apache/spark/ml/classification/LogisticRegressionSuite.scala
+++ b/mllib/src/test/scala/org/apache/spark/ml/classification/LogisticRegressionSuite.scala
@@ -214,12 +214,13 @@ class LogisticRegressionSuite extends SparkFunSuite with MLlibTestSparkContext {
     /*
        Using the following R code to load the data and train the model using glmnet package.

-       > library("glmnet")
-       > data <- read.csv("path", header=FALSE)
-       > label = factor(data$V1)
-       > features = as.matrix(data.frame(data$V2, data$V3, data$V4, data$V5))
-       > weights = coef(glmnet(features,label, family="binomial", alpha = 0, lambda = 0))
-       > weights
+       library("glmnet")
+       data <- read.csv("path", header=FALSE)
+       label = factor(data$V1)
+       features = as.matrix(data.frame(data$V2, data$V3, data$V4, data$V5))
+       weights = coef(glmnet(features,label, family="binomial", alpha = 0, lambda = 0))
+       weights
+
        5 x 1 sparse Matrix of class "dgCMatrix"
                            s0
        (Intercept)  2.8366423
@@ -245,13 +246,14 @@ class LogisticRegressionSuite extends SparkFunSuite with MLlibTestSparkContext {
     /*
        Using the following R code to load the data and train the model using glmnet package.

-       > library("glmnet")
-       > data <- read.csv("path", header=FALSE)
-       > label = factor(data$V1)
-       > features = as.matrix(data.frame(data$V2, data$V3, data$V4, data$V5))
-       > weights =
+       library("glmnet")
+       data <- read.csv("path", header=FALSE)
+       label = factor(data$V1)
+       features = as.matrix(data.frame(data$V2, data$V3, data$V4, data$V5))
+       weights =
            coef(glmnet(features,label, family="binomial", alpha = 0, lambda = 0, intercept=FALSE))
-       > weights
+       weights
+
        5 x 1 sparse Matrix of class "dgCMatrix"
                            s0
        (Intercept)   .
@@ -278,12 +280,13 @@ class LogisticRegressionSuite extends SparkFunSuite with MLlibTestSparkContext {
     /*
        Using the following R code to load the data and train the model using glmnet package.

-       > library("glmnet")
-       > data <- read.csv("path", header=FALSE)
-       > label = factor(data$V1)
-       > features = as.matrix(data.frame(data$V2, data$V3, data$V4, data$V5))
-       > weights = coef(glmnet(features,label, family="binomial", alpha = 1, lambda = 0.12))
-       > weights
+       library("glmnet")
+       data <- read.csv("path", header=FALSE)
+       label = factor(data$V1)
+       features = as.matrix(data.frame(data$V2, data$V3, data$V4, data$V5))
+       weights = coef(glmnet(features,label, family="binomial", alpha = 1, lambda = 0.12))
+       weights
+
        5 x 1 sparse Matrix of class "dgCMatrix"
                            s0
        (Intercept) -0.05627428
@@ -310,13 +313,14 @@ class LogisticRegressionSuite extends SparkFunSuite with MLlibTestSparkContext {
     /*
        Using the following R code to load the data and train the model using glmnet package.

-       > library("glmnet")
-       > data <- read.csv("path", header=FALSE)
-       > label = factor(data$V1)
-       > features = as.matrix(data.frame(data$V2, data$V3, data$V4, data$V5))
-       > weights = coef(glmnet(features,label, family="binomial", alpha = 1, lambda = 0.12,
+       library("glmnet")
+       data <- read.csv("path", header=FALSE)
+       label = factor(data$V1)
+       features = as.matrix(data.frame(data$V2, data$V3, data$V4, data$V5))
+       weights = coef(glmnet(features,label, family="binomial", alpha = 1, lambda = 0.12,
            intercept=FALSE))
-       > weights
+       weights
+
        5 x 1 sparse Matrix of class "dgCMatrix"
                            s0
        (Intercept)   .
@@ -343,12 +347,13 @@ class LogisticRegressionSuite extends SparkFunSuite with MLlibTestSparkContext {
     /*
        Using the following R code to load the data and train the model using glmnet package.

-       > library("glmnet")
-       > data <- read.csv("path", header=FALSE)
-       > label = factor(data$V1)
-       > features = as.matrix(data.frame(data$V2, data$V3, data$V4, data$V5))
-       > weights = coef(glmnet(features,label, family="binomial", alpha = 0, lambda = 1.37))
-       > weights
+       library("glmnet")
+       data <- read.csv("path", header=FALSE)
+       label = factor(data$V1)
+       features = as.matrix(data.frame(data$V2, data$V3, data$V4, data$V5))
+       weights = coef(glmnet(features,label, family="binomial", alpha = 0, lambda = 1.37))
+       weights
+
        5 x 1 sparse Matrix of class "dgCMatrix"
                            s0
        (Intercept) 0.15021751
@@ -375,13 +380,14 @@ class LogisticRegressionSuite extends SparkFunSuite with MLlibTestSparkContext {
     /*
        Using the following R code to load the data and train the model using glmnet package.

-       > library("glmnet")
-       > data <- read.csv("path", header=FALSE)
-       > label = factor(data$V1)
-       > features = as.matrix(data.frame(data$V2, data$V3, data$V4, data$V5))
-       > weights = coef(glmnet(features,label, family="binomial", alpha = 0, lambda = 1.37,
+       library("glmnet")
+       data <- read.csv("path", header=FALSE)
+       label = factor(data$V1)
+       features = as.matrix(data.frame(data$V2, data$V3, data$V4, data$V5))
+       weights = coef(glmnet(features,label, family="binomial", alpha = 0, lambda = 1.37,
            intercept=FALSE))
-       > weights
+       weights
+
        5 x 1 sparse Matrix of class "dgCMatrix"
                            s0
        (Intercept)   .
@@ -408,12 +414,13 @@ class LogisticRegressionSuite extends SparkFunSuite with MLlibTestSparkContext {
     /*
        Using the following R code to load the data and train the model using glmnet package.

-       > library("glmnet")
-       > data <- read.csv("path", header=FALSE)
-       > label = factor(data$V1)
-       > features = as.matrix(data.frame(data$V2, data$V3, data$V4, data$V5))
-       > weights = coef(glmnet(features,label, family="binomial", alpha = 0.38, lambda = 0.21))
-       > weights
+       library("glmnet")
+       data <- read.csv("path", header=FALSE)
+       label = factor(data$V1)
+       features = as.matrix(data.frame(data$V2, data$V3, data$V4, data$V5))
+       weights = coef(glmnet(features,label, family="binomial", alpha = 0.38, lambda = 0.21))
+       weights
+
        5 x 1 sparse Matrix of class "dgCMatrix"
                            s0
        (Intercept) 0.57734851
@@ -440,13 +447,14 @@ class LogisticRegressionSuite extends SparkFunSuite with MLlibTestSparkContext {
     /*
        Using the following R code to load the data and train the model using glmnet package.

-       > library("glmnet")
-       > data <- read.csv("path", header=FALSE)
-       > label = factor(data$V1)
-       > features = as.matrix(data.frame(data$V2, data$V3, data$V4, data$V5))
-       > weights = coef(glmnet(features,label, family="binomial", alpha = 0.38, lambda = 0.21,
+       library("glmnet")
+       data <- read.csv("path", header=FALSE)
+       label = factor(data$V1)
+       features = as.matrix(data.frame(data$V2, data$V3, data$V4, data$V5))
+       weights = coef(glmnet(features,label, family="binomial", alpha = 0.38, lambda = 0.21,
            intercept=FALSE))
-       > weights
+       weights
+
        5 x 1 sparse Matrix of class "dgCMatrix"
                            s0
        (Intercept)   .
@@ -503,12 +511,13 @@ class LogisticRegressionSuite extends SparkFunSuite with MLlibTestSparkContext {
     /*
        Using the following R code to load the data and train the model using glmnet package.

-       > library("glmnet")
-       > data <- read.csv("path", header=FALSE)
-       > label = factor(data$V1)
-       > features = as.matrix(data.frame(data$V2, data$V3, data$V4, data$V5))
-       > weights = coef(glmnet(features,label, family="binomial", alpha = 1.0, lambda = 6.0))
-       > weights
+       library("glmnet")
+       data <- read.csv("path", header=FALSE)
+       label = factor(data$V1)
+       features = as.matrix(data.frame(data$V2, data$V3, data$V4, data$V5))
+       weights = coef(glmnet(features,label, family="binomial", alpha = 1.0, lambda = 6.0))
+       weights
+
        5 x 1 sparse Matrix of class "dgCMatrix"
                            s0
        (Intercept) -0.2480643

From b285ac5ba85fe0b32b00726ad7d3a2efb602e885 Mon Sep 17 00:00:00 2001
From: "zhichao.li"
Date: Wed, 1 Jul 2015 22:19:51 -0700
Subject: [PATCH 2/2] [SPARK-8227] [SQL] Add function unhex

cc chenghao-intel adrian-wang

Author: zhichao.li

Closes #7113 from zhichao-li/unhex and squashes the following commits:

379356e [zhichao.li] remove exception checking
a4ae6dc [zhichao.li] add udf_unhex to whitelist
fe5c14a [zhichao.li] add todigit
607d7a3 [zhichao.li] use checkInputTypes
bffd37f [zhichao.li] change to use Hex in apache common package
cde73f5 [zhichao.li] update to use AutoCastInputTypes
11945c7 [zhichao.li] style
c852d46 [zhichao.li] Add function unhex
---
 .../catalyst/analysis/FunctionRegistry.scala  |  1 +
 .../spark/sql/catalyst/expressions/math.scala | 52 +++++++++++++++++++
 .../expressions/MathFunctionsSuite.scala      |  6 +++
 .../org/apache/spark/sql/functions.scala      | 18 +++++++
 .../spark/sql/MathExpressionsSuite.scala      | 10 ++++
 .../execution/HiveCompatibilitySuite.scala    |  1 +
 6 files changed, 88 insertions(+)

diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/FunctionRegistry.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/FunctionRegistry.scala
index d53eaedda56b0..6f04298d4711b 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/FunctionRegistry.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/FunctionRegistry.scala
@@ -157,6 +157,7 @@ object FunctionRegistry {
     expression[Substring]("substr"),
     expression[Substring]("substring"),
     expression[Upper]("ucase"),
+    expression[UnHex]("unhex"),
     expression[Upper]("upper")
   )

diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/math.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/math.scala
index b51318dd5044c..8633eb06ffee4 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/math.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/math.scala
@@ -351,6 +351,58 @@ case class Pow(left: Expression, right: Expression)
   }
 }

+/**
+ * Performs the inverse operation of HEX.
+ * Resulting characters are returned as a byte array.
+ */
+case class UnHex(child: Expression) extends UnaryExpression with Serializable {
+
+  override def dataType: DataType = BinaryType
+
+  override def checkInputDataTypes(): TypeCheckResult = {
+    if (child.dataType.isInstanceOf[StringType] || child.dataType == NullType) {
+      TypeCheckResult.TypeCheckSuccess
+    } else {
+      TypeCheckResult.TypeCheckFailure(s"unHex accepts String type, not ${child.dataType}")
+    }
+  }
+
+  override def eval(input: InternalRow): Any = {
+    val num = child.eval(input)
+    if (num == null) {
+      null
+    } else {
+      unhex(num.asInstanceOf[UTF8String].getBytes)
+    }
+  }
+
+  private val unhexDigits = {
+    val array = Array.fill[Byte](128)(-1)
+    (0 to 9).foreach(i => array('0' + i) = i.toByte)
+    (0 to 5).foreach(i => array('A' + i) = (i + 10).toByte)
+    (0 to 5).foreach(i => array('a' + i) = (i + 10).toByte)
+    array
+  }
+
+  private def unhex(inputBytes: Array[Byte]): Array[Byte] = {
+    var bytes = inputBytes
+    if ((bytes.length & 0x01) != 0) {
+      bytes = '0'.toByte +: bytes
+    }
+    val out = new Array[Byte](bytes.length >> 1)
+    // two characters form the hex value.
+    var i = 0
+    while (i < bytes.length) {
+      val first = unhexDigits(bytes(i))
+      val second = unhexDigits(bytes(i + 1))
+      if (first == -1 || second == -1) { return null }
+      out(i / 2) = (((first << 4) | second) & 0xFF).toByte
+      i += 2
+    }
+    out
+  }
+}
+
 case class Hypot(left: Expression, right: Expression)
   extends BinaryMathExpression(math.hypot, "HYPOT")

diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/MathFunctionsSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/MathFunctionsSuite.scala
index b932d4ab850c7..b3345d7069159 100644
--- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/MathFunctionsSuite.scala
+++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/MathFunctionsSuite.scala
@@ -238,6 +238,12 @@ class MathFunctionsSuite extends SparkFunSuite with ExpressionEvalHelper {
     // scalastyle:on
   }

+  test("unhex") {
+    checkEvaluation(UnHex(Literal("737472696E67")), "string".getBytes)
+    checkEvaluation(UnHex(Literal("")), new Array[Byte](0))
+    checkEvaluation(UnHex(Literal("0")), Array[Byte](0))
+  }
+
   test("hypot") {
     testBinary(Hypot, math.hypot)
   }

diff --git a/sql/core/src/main/scala/org/apache/spark/sql/functions.scala b/sql/core/src/main/scala/org/apache/spark/sql/functions.scala
index 4e8f3f96bf4db..e6f623bdf39eb 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/functions.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/functions.scala
@@ -1053,6 +1053,24 @@ object functions {
    */
   def hex(colName: String): Column = hex(Column(colName))

+  /**
+   * Inverse of hex. Interprets each pair of characters as a hexadecimal number
+   * and converts to the byte representation of the number.
+   *
+   * @group math_funcs
+   * @since 1.5.0
+   */
+  def unhex(column: Column): Column = UnHex(column.expr)
+
+  /**
+   * Inverse of hex. Interprets each pair of characters as a hexadecimal number
+   * and converts to the byte representation of the number.
+   *
+   * @group math_funcs
+   * @since 1.5.0
+   */
+  def unhex(colName: String): Column = unhex(Column(colName))
+
   /**
    * Computes `sqrt(a^2^ + b^2^)` without intermediate overflow or underflow.
    *
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/MathExpressionsSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/MathExpressionsSuite.scala
index d6331aa4ff09e..c03cde38d75d0 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/MathExpressionsSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/MathExpressionsSuite.scala
@@ -225,6 +225,16 @@ class MathExpressionsSuite extends QueryTest {
     checkAnswer(data.selectExpr("hex(cast(d as binary))"), Seq(Row("68656C6C6F")))
   }

+  test("unhex") {
+    val data = Seq(("1C", "737472696E67")).toDF("a", "b")
+    checkAnswer(data.select(unhex('a)), Row(Array[Byte](28.toByte)))
+    checkAnswer(data.select(unhex('b)), Row("string".getBytes))
+    checkAnswer(data.selectExpr("unhex(a)"), Row(Array[Byte](28.toByte)))
+    checkAnswer(data.selectExpr("unhex(b)"), Row("string".getBytes))
+    checkAnswer(data.selectExpr("""unhex("##")"""), Row(null))
+    checkAnswer(data.selectExpr("""unhex("G123")"""), Row(null))
+  }
+
   test("hypot") {
     testTwoToOneMathFunction(hypot, hypot, math.hypot)
   }

diff --git a/sql/hive/compatibility/src/test/scala/org/apache/spark/sql/hive/execution/HiveCompatibilitySuite.scala b/sql/hive/compatibility/src/test/scala/org/apache/spark/sql/hive/execution/HiveCompatibilitySuite.scala
index f88e62763ca70..415a81644c58f 100644
--- a/sql/hive/compatibility/src/test/scala/org/apache/spark/sql/hive/execution/HiveCompatibilitySuite.scala
+++ b/sql/hive/compatibility/src/test/scala/org/apache/spark/sql/hive/execution/HiveCompatibilitySuite.scala
@@ -949,6 +949,7 @@ class HiveCompatibilitySuite extends HiveQueryFileTest with BeforeAndAfter {
     "udf_trim",
     "udf_ucase",
     "udf_unix_timestamp",
+    "udf_unhex",
     "udf_upper",
     "udf_var_pop",
     "udf_var_samp",
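The decoding logic added to math.scala above can be exercised on its own. Below is a minimal standalone Scala sketch of the same digit-table approach; the object name UnhexSketch, the unhex(String) signature, and the Option result are illustrative assumptions and not part of the patch (the UnHex expression itself returns null on invalid input).

// A standalone sketch of UnHex's decoding; names here are illustrative only.
object UnhexSketch {

  // ASCII code -> 4-bit value; -1 marks characters that are not hex digits,
  // mirroring the unhexDigits table added to math.scala.
  private val digits: Array[Byte] = {
    val a = Array.fill[Byte](128)(-1)
    (0 to 9).foreach(i => a('0' + i) = i.toByte)
    (0 to 5).foreach(i => a('A' + i) = (i + 10).toByte)
    (0 to 5).foreach(i => a('a' + i) = (i + 10).toByte)
    a
  }

  // Returns None on any non-hex character, where UnHex would return null.
  def unhex(s: String): Option[Array[Byte]] = {
    // An odd-length input is treated as if it had a leading '0', as in the patch.
    val padded = if (s.length % 2 != 0) "0" + s else s
    val out = new Array[Byte](padded.length / 2)
    var i = 0
    while (i < padded.length) {
      val c1 = padded.charAt(i)
      val c2 = padded.charAt(i + 1)
      if (c1 >= 128 || c2 >= 128) return None       // non-ASCII character
      val hi = digits(c1)
      val lo = digits(c2)
      if (hi == -1 || lo == -1) return None         // not a hex digit
      out(i / 2) = ((hi << 4) | lo).toByte          // two hex characters form one byte
      i += 2
    }
    Some(out)
  }

  def main(args: Array[String]): Unit = {
    println(unhex("737472696E67").map(b => new String(b)))  // Some(string)
    println(unhex("0").map(_.toList))                       // Some(List(0)): "0" is padded to "00"
    println(unhex("G123"))                                  // None: 'G' is not a hex digit
  }
}

The example inputs above come from the patch's own tests. Once the FunctionRegistry entry is in place, the same behavior is reachable from SQL, e.g. SELECT unhex('737472696E67'), and from the DataFrame API via the unhex functions added to functions.scala, as the new test suites demonstrate.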