-
Notifications
You must be signed in to change notification settings - Fork 28.1k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
[WIP][Spark-SQL] Optimize the Constant Folding for Expression #482
Changes from 12 commits
2645d4f
3c045c7
536c005
9cf0396
543ef9d
9ccefdb
b28e03a
27ea3d7
80f9f18
50444cc
29c8166
68b9fad
2f14b50
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -28,6 +28,7 @@ case class GetItem(child: Expression, ordinal: Expression) extends Expression { | |
val children = child :: ordinal :: Nil | ||
/** `Null` is returned for invalid ordinals. */ | ||
override def nullable = true | ||
override def foldable = child.foldable && ordinal.foldable | ||
override def references = children.flatMap(_.references).toSet | ||
def dataType = child.dataType match { | ||
case ArrayType(dt) => dt | ||
|
@@ -40,23 +41,27 @@ case class GetItem(child: Expression, ordinal: Expression) extends Expression { | |
override def toString = s"$child[$ordinal]" | ||
|
||
override def eval(input: Row): Any = { | ||
if (child.dataType.isInstanceOf[ArrayType]) { | ||
val baseValue = child.eval(input).asInstanceOf[Seq[_]] | ||
val o = ordinal.eval(input).asInstanceOf[Int] | ||
if (baseValue == null) { | ||
null | ||
} else if (o >= baseValue.size || o < 0) { | ||
null | ||
} else { | ||
baseValue(o) | ||
} | ||
val value = child.eval(input) | ||
if(value == null) { | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Space after |
||
null | ||
} else { | ||
val baseValue = child.eval(input).asInstanceOf[Map[Any, _]] | ||
val key = ordinal.eval(input) | ||
if (baseValue == null) { | ||
if(key == null) { | ||
null | ||
} else { | ||
baseValue.get(key).orNull | ||
if (child.dataType.isInstanceOf[ArrayType]) { | ||
val baseValue = value.asInstanceOf[Seq[_]] | ||
val o = key.asInstanceOf[Int] | ||
if (o >= baseValue.size || o < 0) { | ||
null | ||
} else { | ||
baseValue(o) | ||
} | ||
} else { | ||
val baseValue = value.asInstanceOf[Map[Any, _]] | ||
val key = ordinal.eval(input) | ||
baseValue.get(key).orNull | ||
} | ||
} | ||
} | ||
} | ||
|
@@ -69,7 +74,8 @@ case class GetField(child: Expression, fieldName: String) extends UnaryExpressio | |
type EvaluatedType = Any | ||
|
||
def dataType = field.dataType | ||
def nullable = field.nullable | ||
override def nullable = field.nullable | ||
override def foldable = child.foldable | ||
|
||
protected def structType = child.dataType match { | ||
case s: StructType => s | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -26,6 +26,7 @@ import org.apache.spark.sql.catalyst.types._ | |
object Optimizer extends RuleExecutor[LogicalPlan] { | ||
val batches = | ||
Batch("ConstantFolding", Once, | ||
NullPropagation, | ||
ConstantFolding, | ||
BooleanSimplification, | ||
SimplifyFilters, | ||
|
@@ -85,6 +86,71 @@ object ColumnPruning extends Rule[LogicalPlan] { | |
} | ||
} | ||
|
||
/** | ||
* Replaces [[catalyst.expressions.Expression Expressions]] that can be statically evaluated with | ||
* equivalent [[catalyst.expressions.Literal Literal]] values. This rule is more specific with | ||
* Null value propagation from bottom to top of the expression tree. | ||
*/ | ||
object NullPropagation extends Rule[LogicalPlan] { | ||
def apply(plan: LogicalPlan): LogicalPlan = plan transform { | ||
case q: LogicalPlan => q transformExpressionsUp { | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Do we want There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
|
||
case e @ Count(Literal(null, _)) => Literal(0, e.dataType) | ||
case e @ Sum(Literal(c, _)) if(c == 0) => Literal(0, e.dataType) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Not like the previous implementation, I didn't add the null value as Child expression for both Sum and Average in the rule, cause Hive will throw semantic exception. |
||
case e @ Average(Literal(c, _)) if(c == 0) => Literal(0.0, e.dataType) | ||
case e @ IsNull(c) if c.nullable == false => Literal(false, BooleanType) | ||
case e @ IsNotNull(c) if c.nullable == false => Literal(true, BooleanType) | ||
case e @ GetItem(Literal(null, _), _) => Literal(null, e.dataType) | ||
case e @ GetItem(_, Literal(null, _)) => Literal(null, e.dataType) | ||
case e @ GetField(Literal(null, _), _) => Literal(null, e.dataType) | ||
case e @ Coalesce(children) => { | ||
val newChildren = children.filter(c => c match { | ||
case Literal(null, _) => false | ||
case _ => true | ||
}) | ||
if(newChildren.length == 0) { | ||
Literal(null, e.dataType) | ||
} else if(newChildren.length == 1) { | ||
newChildren(0) | ||
} else { | ||
Coalesce(newChildren) | ||
} | ||
} | ||
case e @ If(Literal(v, _), trueValue, falseValue) => if(v == true) trueValue else falseValue | ||
case e @ In(Literal(v, _), list) if(list.exists(c => c match { | ||
case Literal(candidate, _) if(candidate == v) => true | ||
case _ => false | ||
})) => Literal(true, BooleanType) | ||
case e: UnaryMinus => e.child match { | ||
case Literal(null, _) => Literal(null, e.dataType) | ||
case _ => e | ||
} | ||
case e: Cast => e.child match { | ||
case Literal(null, _) => Literal(null, e.dataType) | ||
case _ => e | ||
} | ||
case e: Not => e.child match { | ||
case Literal(null, _) => Literal(null, e.dataType) | ||
case _ => e | ||
} | ||
// Put exceptional cases above if any | ||
case e: BinaryArithmetic => e.children match { | ||
case Literal(null, _) :: right :: Nil => Literal(null, e.dataType) | ||
case left :: Literal(null, _) :: Nil => Literal(null, e.dataType) | ||
case _ => e | ||
} | ||
case e: BinaryComparison => e.children match { | ||
case Literal(null, _) :: right :: Nil => Literal(null, e.dataType) | ||
case left :: Literal(null, _) :: Nil => Literal(null, e.dataType) | ||
case _ => e | ||
} | ||
case e: StringRegexExpression => e.children match { | ||
case Literal(null, _) :: right :: Nil => Literal(null, e.dataType) | ||
case left :: Literal(null, _) :: Nil => Literal(null, e.dataType) | ||
case _ => e | ||
} | ||
} | ||
} | ||
} | ||
/** | ||
* Replaces [[catalyst.expressions.Expression Expressions]] that can be statically evaluated with | ||
* equivalent [[catalyst.expressions.Literal Literal]] values. | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
What is the rationale for changing all of these?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I think "def boolean" / "def string" should return an Attribute with nullability = true, that does not harm for the correctness, otherwise, it may bring a wrong hint for the optimization of rule NullPropagation.
For example:
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Good point.