Skip to content

Commit

Permalink
Merge remote-tracking branch 'origin/master' into 8223
Browse files Browse the repository at this point in the history
# Conflicts:
#	sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/math.scala
  • Loading branch information
tarekbecker committed Jul 2, 2015
2 parents 5189690 + b285ac5 commit 3b56f2a
Show file tree
Hide file tree
Showing 7 changed files with 151 additions and 54 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -214,12 +214,13 @@ class LogisticRegressionSuite extends SparkFunSuite with MLlibTestSparkContext {
/*
Using the following R code to load the data and train the model using glmnet package.
> library("glmnet")
> data <- read.csv("path", header=FALSE)
> label = factor(data$V1)
> features = as.matrix(data.frame(data$V2, data$V3, data$V4, data$V5))
> weights = coef(glmnet(features,label, family="binomial", alpha = 0, lambda = 0))
> weights
library("glmnet")
data <- read.csv("path", header=FALSE)
label = factor(data$V1)
features = as.matrix(data.frame(data$V2, data$V3, data$V4, data$V5))
weights = coef(glmnet(features,label, family="binomial", alpha = 0, lambda = 0))
weights
5 x 1 sparse Matrix of class "dgCMatrix"
s0
(Intercept) 2.8366423
Expand All @@ -245,13 +246,14 @@ class LogisticRegressionSuite extends SparkFunSuite with MLlibTestSparkContext {
/*
Using the following R code to load the data and train the model using glmnet package.
> library("glmnet")
> data <- read.csv("path", header=FALSE)
> label = factor(data$V1)
> features = as.matrix(data.frame(data$V2, data$V3, data$V4, data$V5))
> weights =
library("glmnet")
data <- read.csv("path", header=FALSE)
label = factor(data$V1)
features = as.matrix(data.frame(data$V2, data$V3, data$V4, data$V5))
weights =
coef(glmnet(features,label, family="binomial", alpha = 0, lambda = 0, intercept=FALSE))
> weights
weights
5 x 1 sparse Matrix of class "dgCMatrix"
s0
(Intercept) .
Expand All @@ -278,12 +280,13 @@ class LogisticRegressionSuite extends SparkFunSuite with MLlibTestSparkContext {
/*
Using the following R code to load the data and train the model using glmnet package.
> library("glmnet")
> data <- read.csv("path", header=FALSE)
> label = factor(data$V1)
> features = as.matrix(data.frame(data$V2, data$V3, data$V4, data$V5))
> weights = coef(glmnet(features,label, family="binomial", alpha = 1, lambda = 0.12))
> weights
library("glmnet")
data <- read.csv("path", header=FALSE)
label = factor(data$V1)
features = as.matrix(data.frame(data$V2, data$V3, data$V4, data$V5))
weights = coef(glmnet(features,label, family="binomial", alpha = 1, lambda = 0.12))
weights
5 x 1 sparse Matrix of class "dgCMatrix"
s0
(Intercept) -0.05627428
Expand All @@ -310,13 +313,14 @@ class LogisticRegressionSuite extends SparkFunSuite with MLlibTestSparkContext {
/*
Using the following R code to load the data and train the model using glmnet package.
> library("glmnet")
> data <- read.csv("path", header=FALSE)
> label = factor(data$V1)
> features = as.matrix(data.frame(data$V2, data$V3, data$V4, data$V5))
> weights = coef(glmnet(features,label, family="binomial", alpha = 1, lambda = 0.12,
library("glmnet")
data <- read.csv("path", header=FALSE)
label = factor(data$V1)
features = as.matrix(data.frame(data$V2, data$V3, data$V4, data$V5))
weights = coef(glmnet(features,label, family="binomial", alpha = 1, lambda = 0.12,
intercept=FALSE))
> weights
weights
5 x 1 sparse Matrix of class "dgCMatrix"
s0
(Intercept) .
Expand All @@ -343,12 +347,13 @@ class LogisticRegressionSuite extends SparkFunSuite with MLlibTestSparkContext {
/*
Using the following R code to load the data and train the model using glmnet package.
> library("glmnet")
> data <- read.csv("path", header=FALSE)
> label = factor(data$V1)
> features = as.matrix(data.frame(data$V2, data$V3, data$V4, data$V5))
> weights = coef(glmnet(features,label, family="binomial", alpha = 0, lambda = 1.37))
> weights
library("glmnet")
data <- read.csv("path", header=FALSE)
label = factor(data$V1)
features = as.matrix(data.frame(data$V2, data$V3, data$V4, data$V5))
weights = coef(glmnet(features,label, family="binomial", alpha = 0, lambda = 1.37))
weights
5 x 1 sparse Matrix of class "dgCMatrix"
s0
(Intercept) 0.15021751
Expand All @@ -375,13 +380,14 @@ class LogisticRegressionSuite extends SparkFunSuite with MLlibTestSparkContext {
/*
Using the following R code to load the data and train the model using glmnet package.
> library("glmnet")
> data <- read.csv("path", header=FALSE)
> label = factor(data$V1)
> features = as.matrix(data.frame(data$V2, data$V3, data$V4, data$V5))
> weights = coef(glmnet(features,label, family="binomial", alpha = 0, lambda = 1.37,
library("glmnet")
data <- read.csv("path", header=FALSE)
label = factor(data$V1)
features = as.matrix(data.frame(data$V2, data$V3, data$V4, data$V5))
weights = coef(glmnet(features,label, family="binomial", alpha = 0, lambda = 1.37,
intercept=FALSE))
> weights
weights
5 x 1 sparse Matrix of class "dgCMatrix"
s0
(Intercept) .
Expand All @@ -408,12 +414,13 @@ class LogisticRegressionSuite extends SparkFunSuite with MLlibTestSparkContext {
/*
Using the following R code to load the data and train the model using glmnet package.
> library("glmnet")
> data <- read.csv("path", header=FALSE)
> label = factor(data$V1)
> features = as.matrix(data.frame(data$V2, data$V3, data$V4, data$V5))
> weights = coef(glmnet(features,label, family="binomial", alpha = 0.38, lambda = 0.21))
> weights
library("glmnet")
data <- read.csv("path", header=FALSE)
label = factor(data$V1)
features = as.matrix(data.frame(data$V2, data$V3, data$V4, data$V5))
weights = coef(glmnet(features,label, family="binomial", alpha = 0.38, lambda = 0.21))
weights
5 x 1 sparse Matrix of class "dgCMatrix"
s0
(Intercept) 0.57734851
Expand All @@ -440,13 +447,14 @@ class LogisticRegressionSuite extends SparkFunSuite with MLlibTestSparkContext {
/*
Using the following R code to load the data and train the model using glmnet package.
> library("glmnet")
> data <- read.csv("path", header=FALSE)
> label = factor(data$V1)
> features = as.matrix(data.frame(data$V2, data$V3, data$V4, data$V5))
> weights = coef(glmnet(features,label, family="binomial", alpha = 0.38, lambda = 0.21,
library("glmnet")
data <- read.csv("path", header=FALSE)
label = factor(data$V1)
features = as.matrix(data.frame(data$V2, data$V3, data$V4, data$V5))
weights = coef(glmnet(features,label, family="binomial", alpha = 0.38, lambda = 0.21,
intercept=FALSE))
> weights
weights
5 x 1 sparse Matrix of class "dgCMatrix"
s0
(Intercept) .
Expand Down Expand Up @@ -503,12 +511,13 @@ class LogisticRegressionSuite extends SparkFunSuite with MLlibTestSparkContext {
/*
Using the following R code to load the data and train the model using glmnet package.
> library("glmnet")
> data <- read.csv("path", header=FALSE)
> label = factor(data$V1)
> features = as.matrix(data.frame(data$V2, data$V3, data$V4, data$V5))
> weights = coef(glmnet(features,label, family="binomial", alpha = 1.0, lambda = 6.0))
> weights
library("glmnet")
data <- read.csv("path", header=FALSE)
label = factor(data$V1)
features = as.matrix(data.frame(data$V2, data$V3, data$V4, data$V5))
weights = coef(glmnet(features,label, family="binomial", alpha = 1.0, lambda = 6.0))
weights
5 x 1 sparse Matrix of class "dgCMatrix"
s0
(Intercept) -0.2480643
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -159,6 +159,7 @@ object FunctionRegistry {
expression[Substring]("substr"),
expression[Substring]("substring"),
expression[Upper]("ucase"),
expression[UnHex]("unhex"),
expression[Upper]("upper")
)

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -453,6 +453,58 @@ case class ShiftRight(left: Expression, right: Expression) extends BinaryExpress
override def toString: String = s"ShiftRight($left, $right)"
}

/**
 * Performs the inverse operation of HEX: decodes a string of hexadecimal digits into bytes.
 *
 * Every pair of characters is read as one byte. An input with an odd number of characters is
 * treated as if it were prefixed with '0'. The result is a byte array, or null when the input
 * is null or contains any character that is not a hexadecimal digit (including non-ASCII
 * characters).
 */
case class UnHex(child: Expression) extends UnaryExpression with Serializable {

  // NOTE(review): evaluation can yield null for invalid input even when the child never does;
  // confirm that the nullability inherited from UnaryExpression accounts for that.
  override def dataType: DataType = BinaryType

  /** Accepts only string input (or the null type); anything else is a type-check failure. */
  override def checkInputDataTypes(): TypeCheckResult = {
    if (child.dataType.isInstanceOf[StringType] || child.dataType == NullType) {
      TypeCheckResult.TypeCheckSuccess
    } else {
      TypeCheckResult.TypeCheckFailure(s"unHex accepts String type, not ${child.dataType}")
    }
  }

  /** Evaluates the child and decodes its bytes; a null child value propagates as null. */
  override def eval(input: InternalRow): Any = {
    val num = child.eval(input)
    if (num == null) {
      null
    } else {
      unhex(num.asInstanceOf[UTF8String].getBytes)
    }
  }

  /** Lookup table from ASCII code (0-127) to hex digit value; -1 marks a non-hex character. */
  private val unhexDigits = {
    val array = Array.fill[Byte](128)(-1)
    (0 to 9).foreach(i => array('0' + i) = i.toByte)
    (0 to 5).foreach(i => array('A' + i) = (i + 10).toByte)
    (0 to 5).foreach(i => array('a' + i) = (i + 10).toByte)
    array
  }

  /**
   * Value of a single hex digit, or -1 if `b` is not one.
   *
   * Bytes of multi-byte UTF-8 characters are negative when read as a signed Byte and would
   * index outside the lookup table, so they are rejected before the lookup.
   */
  private def digitValue(b: Byte): Int = if (b < 0) -1 else unhexDigits(b)

  /**
   * Decodes hex characters (given as bytes) into the bytes they represent.
   *
   * @param inputBytes encoded characters of the hex string
   * @return the decoded bytes, or null if any character is not a hexadecimal digit
   */
  private def unhex(inputBytes: Array[Byte]): Array[Byte] = {
    // Left-pad odd-length input with '0' so that the digits always pair up.
    val bytes = if ((inputBytes.length & 0x01) != 0) '0'.toByte +: inputBytes else inputBytes
    val out = new Array[Byte](bytes.length >> 1)
    // two characters form the hex value.
    var i = 0
    while (i < bytes.length) {
      val first = digitValue(bytes(i))
      val second = digitValue(bytes(i + 1))
      if (first == -1 || second == -1) { return null }
      out(i / 2) = (((first << 4) | second) & 0xFF).toByte
      i += 2
    }
    out
  }
}

/**
 * Binary math expression registered under the name "HYPOT" and backed by `math.hypot`.
 */
case class Hypot(left: Expression, right: Expression)
extends BinaryMathExpression(math.hypot, "HYPOT")

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -264,6 +264,12 @@ class MathFunctionsSuite extends SparkFunSuite with ExpressionEvalHelper {
// scalastyle:on
}

test("unhex") {
  // Each entry pairs a hex string with the bytes it is expected to decode to:
  // a regular even-length input, the empty string, and an odd-length input.
  val cases = Seq[(String, Array[Byte])](
    ("737472696E67", "string".getBytes),
    ("", new Array[Byte](0)),
    ("0", Array[Byte](0)))

  cases.foreach { case (hexString, expectedBytes) =>
    checkEvaluation(UnHex(Literal(hexString)), expectedBytes)
  }
}

// Exercises the Hypot expression through the suite's testBinary helper, handing it
// math.hypot as the reference function (the helper itself is defined elsewhere).
test("hypot") {
testBinary(Hypot, math.hypot)
}
Expand Down
18 changes: 18 additions & 0 deletions sql/core/src/main/scala/org/apache/spark/sql/functions.scala
Original file line number Diff line number Diff line change
Expand Up @@ -1053,6 +1053,24 @@ object functions {
*/
def hex(colName: String): Column = hex(Column(colName))

/**
 * Inverse of hex. Interprets each pair of characters as a hexadecimal number
 * and converts it to the byte that number represents, producing a binary value.
 * An input with an odd number of characters is read as if it had a leading '0'.
 * The result is null if the input contains a character that is not a hexadecimal digit.
 *
 * @group math_funcs
 * @since 1.5.0
 */
def unhex(column: Column): Column = UnHex(column.expr)

/**
 * Inverse of hex. Interprets each pair of characters as a hexadecimal number
 * and converts it to the byte that number represents, producing a binary value.
 * Looks the column up by name and delegates to the `Column` overload of `unhex`.
 *
 * @group math_funcs
 * @since 1.5.0
 */
def unhex(colName: String): Column = unhex(Column(colName))

/**
* Computes `sqrt(a^2^ + b^2^)` without intermediate overflow or underflow.
*
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -225,6 +225,16 @@ class MathExpressionsSuite extends QueryTest {
checkAnswer(data.selectExpr("hex(cast(d as binary))"), Seq(Row("68656C6C6F")))
}

test("unhex") {
  val df = Seq(("1C", "737472696E67")).toDF("a", "b")
  val decodedA = Row(Array[Byte](28.toByte))
  val decodedB = Row("string".getBytes)

  // Valid hex, through the Column API and then through SQL expression strings.
  checkAnswer(df.select(unhex('a)), decodedA)
  checkAnswer(df.select(unhex('b)), decodedB)
  checkAnswer(df.selectExpr("unhex(a)"), decodedA)
  checkAnswer(df.selectExpr("unhex(b)"), decodedB)

  // Input containing non-hex characters decodes to null.
  Seq("""unhex("##")""", """unhex("G123")""").foreach { invalidExpr =>
    checkAnswer(df.selectExpr(invalidExpr), Row(null))
  }
}

// Exercises hypot through the suite's testTwoToOneMathFunction helper, handing it
// math.hypot as the reference function (the helper itself is defined elsewhere).
test("hypot") {
testTwoToOneMathFunction(hypot, hypot, math.hypot)
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -949,6 +949,7 @@ class HiveCompatibilitySuite extends HiveQueryFileTest with BeforeAndAfter {
"udf_trim",
"udf_ucase",
"udf_unix_timestamp",
"udf_unhex",
"udf_upper",
"udf_var_pop",
"udf_var_samp",
Expand Down

0 comments on commit 3b56f2a

Please sign in to comment.