From c6e84152de4cdd65aa90234cb5554ea669c89a5d Mon Sep 17 00:00:00 2001
From: Liang-Chi Hsieh
Date: Fri, 11 Mar 2016 13:09:50 +0800
Subject: [PATCH 1/4] Force expression children to be canonicalized before the
 expression itself is canonicalized.

---
 .../apache/spark/sql/catalyst/expressions/Expression.scala | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Expression.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Expression.scala
index 692c16092fe3f..16a1b2aee2730 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Expression.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Expression.scala
@@ -152,7 +152,10 @@ abstract class Expression extends TreeNode[Expression] {
    * `deterministic` expressions where `this.canonicalized == other.canonicalized` will always
    * evaluate to the same result.
    */
-  lazy val canonicalized: Expression = Canonicalize.execute(this)
+  lazy val canonicalized: Expression = {
+    val canonicalizedChildren = children.map(_.canonicalized)
+    Canonicalize.execute(withNewChildren(canonicalizedChildren))
+  }
 
   /**
    * Returns true when two expressions will always compute the same result, even if they differ
@@ -161,7 +164,7 @@ abstract class Expression extends TreeNode[Expression] {
    * See [[Canonicalize]] for more details.
    */
   def semanticEquals(other: Expression): Boolean =
-    deterministic && other.deterministic && canonicalized == other.canonicalized
+      deterministic && other.deterministic && canonicalized == other.canonicalized
 
   /**
    * Returns a `hashCode` for the calculation performed by this expression. Unlike the standard

From cbab01763cf753779ae9a8557d88e9ec80e7be59 Mon Sep 17 00:00:00 2001
From: Liang-Chi Hsieh
Date: Fri, 11 Mar 2016 16:33:41 +0800
Subject: [PATCH 2/4] Latest change.

---
 .../spark/sql/catalyst/expressions/Canonicalize.scala | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Canonicalize.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Canonicalize.scala
index b58a5273041e4..df3b7a9889fcc 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Canonicalize.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Canonicalize.scala
@@ -36,15 +36,16 @@ import org.apache.spark.sql.catalyst.rules._
 object Canonicalize extends RuleExecutor[Expression] {
   override protected def batches: Seq[Batch] =
     Batch(
-      "Expression Canonicalization", FixedPoint(100),
+      "Expression Canonicalization", Once,
       IgnoreNamesTypes,
       Reorder) :: Nil
 
   /** Remove names and nullability from types. */
   protected object IgnoreNamesTypes extends Rule[Expression] {
-    override def apply(e: Expression): Expression = e transformUp {
+    override def apply(e: Expression): Expression = e match {
       case a: AttributeReference =>
         AttributeReference("none", a.dataType.asNullable)(exprId = a.exprId)
+      case _ => e
     }
   }
 
@@ -64,7 +65,7 @@ object Canonicalize extends RuleExecutor[Expression] {
 
   /** Rearrange expressions that are commutative or associative. */
   protected object Reorder extends Rule[Expression] {
-    override def apply(e: Expression): Expression = e transformUp {
+    override def apply(e: Expression): Expression = e match {
       case a: Add => orderCommutative(a, { case Add(l, r) => Seq(l, r) }).reduce(Add)
       case m: Multiply => orderCommutative(m, { case Multiply(l, r) => Seq(l, r) }).reduce(Multiply)
 
@@ -76,6 +77,8 @@ object Canonicalize extends RuleExecutor[Expression] {
 
       case GreaterThanOrEqual(l, r) if l.hashCode() > r.hashCode() => LessThanOrEqual(r, l)
       case LessThanOrEqual(l, r) if l.hashCode() > r.hashCode() => GreaterThanOrEqual(r, l)
+
+      case _ => e
     }
   }
 }

From 04b3ce46889ed2b519bb705e8088814a3936a3c2 Mon Sep 17 00:00:00 2001
From: Liang-Chi Hsieh
Date: Sat, 12 Mar 2016 02:26:03 +0000
Subject: [PATCH 3/4] Address comments.

---
 .../catalyst/expressions/Canonicalize.scala   | 84 -------------------
 .../sql/catalyst/expressions/Expression.scala | 58 +++++++++++++
 2 files changed, 58 insertions(+), 84 deletions(-)
 delete mode 100644 sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Canonicalize.scala

diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Canonicalize.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Canonicalize.scala
deleted file mode 100644
index df3b7a9889fcc..0000000000000
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Canonicalize.scala
+++ /dev/null
@@ -1,84 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- *    http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.spark.sql.catalyst.expressions
-
-import org.apache.spark.sql.catalyst.rules._
-
-/**
- * Rewrites an expression using rules that are guaranteed to preserve the result while attempting
- * to remove cosmetic variations. Deterministic expressions that are `equal` after canonicalization
- * will always return the same answer given the same input (i.e. false positives should not be
- * possible). However, it is possible that two canonical expressions that are not equal will in fact
- * return the same answer given any input (i.e. false negatives are possible).
- *
- * The following rules are applied:
- * - Names and nullability hints for [[org.apache.spark.sql.types.DataType]]s are stripped.
- * - Commutative and associative operations ([[Add]] and [[Multiply]]) have their children ordered
- *   by `hashCode`.
- * - [[EqualTo]] and [[EqualNullSafe]] are reordered by `hashCode`.
- * - Other comparisons ([[GreaterThan]], [[LessThan]]) are reversed by `hashCode`.
- */
-object Canonicalize extends RuleExecutor[Expression] {
-  override protected def batches: Seq[Batch] =
-    Batch(
-      "Expression Canonicalization", Once,
-      IgnoreNamesTypes,
-      Reorder) :: Nil
-
-  /** Remove names and nullability from types. */
-  protected object IgnoreNamesTypes extends Rule[Expression] {
-    override def apply(e: Expression): Expression = e match {
-      case a: AttributeReference =>
-        AttributeReference("none", a.dataType.asNullable)(exprId = a.exprId)
-      case _ => e
-    }
-  }
-
-  /** Collects adjacent commutative operations. */
-  protected def gatherCommutative(
-      e: Expression,
-      f: PartialFunction[Expression, Seq[Expression]]): Seq[Expression] = e match {
-    case c if f.isDefinedAt(c) => f(c).flatMap(gatherCommutative(_, f))
-    case other => other :: Nil
-  }
-
-  /** Orders a set of commutative operations by their hash code. */
-  protected def orderCommutative(
-      e: Expression,
-      f: PartialFunction[Expression, Seq[Expression]]): Seq[Expression] =
-    gatherCommutative(e, f).sortBy(_.hashCode())
-
-  /** Rearrange expressions that are commutative or associative. */
-  protected object Reorder extends Rule[Expression] {
-    override def apply(e: Expression): Expression = e match {
-      case a: Add => orderCommutative(a, { case Add(l, r) => Seq(l, r) }).reduce(Add)
-      case m: Multiply => orderCommutative(m, { case Multiply(l, r) => Seq(l, r) }).reduce(Multiply)
-
-      case EqualTo(l, r) if l.hashCode() > r.hashCode() => EqualTo(r, l)
-      case EqualNullSafe(l, r) if l.hashCode() > r.hashCode() => EqualNullSafe(r, l)
-
-      case GreaterThan(l, r) if l.hashCode() > r.hashCode() => LessThan(r, l)
-      case LessThan(l, r) if l.hashCode() > r.hashCode() => GreaterThan(r, l)
-
-      case GreaterThanOrEqual(l, r) if l.hashCode() > r.hashCode() => LessThanOrEqual(r, l)
-      case LessThanOrEqual(l, r) if l.hashCode() > r.hashCode() => GreaterThanOrEqual(r, l)
-
-      case _ => e
-    }
-  }
-}
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Expression.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Expression.scala
index 16a1b2aee2730..8ce582794479d 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Expression.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Expression.scala
@@ -601,3 +601,61 @@ abstract class TernaryExpression extends Expression {
     }
   }
 }
+
+/**
+ * Rewrites an expression using rules that are guaranteed to preserve the result while attempting
+ * to remove cosmetic variations. Deterministic expressions that are `equal` after canonicalization
+ * will always return the same answer given the same input (i.e. false positives should not be
+ * possible). However, it is possible that two canonical expressions that are not equal will in fact
+ * return the same answer given any input (i.e. false negatives are possible).
+ *
+ * The following rules are applied:
+ * - Names and nullability hints for [[org.apache.spark.sql.types.DataType]]s are stripped.
+ * - Commutative and associative operations ([[Add]] and [[Multiply]]) have their children ordered
+ *   by `hashCode`.
+ * - [[EqualTo]] and [[EqualNullSafe]] are reordered by `hashCode`.
+ * - Other comparisons ([[GreaterThan]], [[LessThan]]) are reversed by `hashCode`.
+ */
+object Canonicalize {
+  def execute(e: Expression): Expression = {
+    expressionReorder(ignoreNamesTypes(e))
+  }
+
+  /** Remove names and nullability from types. */
+  private def ignoreNamesTypes(e: Expression): Expression = e match {
+    case a: AttributeReference =>
+      AttributeReference("none", a.dataType.asNullable)(exprId = a.exprId)
+    case _ => e
+  }
+
+  /** Collects adjacent commutative operations. */
+  private def gatherCommutative(
+      e: Expression,
+      f: PartialFunction[Expression, Seq[Expression]]): Seq[Expression] = e match {
+    case c if f.isDefinedAt(c) => f(c).flatMap(gatherCommutative(_, f))
+    case other => other :: Nil
+  }
+
+  /** Orders a set of commutative operations by their hash code. */
+  private def orderCommutative(
+      e: Expression,
+      f: PartialFunction[Expression, Seq[Expression]]): Seq[Expression] =
+    gatherCommutative(e, f).sortBy(_.hashCode())
+
+  /** Rearrange expressions that are commutative or associative. */
+  private def expressionReorder(e: Expression): Expression = e match {
+    case a: Add => orderCommutative(a, { case Add(l, r) => Seq(l, r) }).reduce(Add)
+    case m: Multiply => orderCommutative(m, { case Multiply(l, r) => Seq(l, r) }).reduce(Multiply)
+
+    case EqualTo(l, r) if l.hashCode() > r.hashCode() => EqualTo(r, l)
+    case EqualNullSafe(l, r) if l.hashCode() > r.hashCode() => EqualNullSafe(r, l)
+
+    case GreaterThan(l, r) if l.hashCode() > r.hashCode() => LessThan(r, l)
+    case LessThan(l, r) if l.hashCode() > r.hashCode() => GreaterThan(r, l)
+
+    case GreaterThanOrEqual(l, r) if l.hashCode() > r.hashCode() => LessThanOrEqual(r, l)
+    case LessThanOrEqual(l, r) if l.hashCode() > r.hashCode() => GreaterThanOrEqual(r, l)
+
+    case _ => e
+  }
+}

From 4d3a310b5fc5f5c235299446e9ece696b1a3212b Mon Sep 17 00:00:00 2001
From: Liang-Chi Hsieh
Date: Mon, 14 Mar 2016 06:23:27 +0000
Subject: [PATCH 4/4] For comment.

---
 .../catalyst/expressions/Canonicalize.scala   | 76 +++++++++++++++++++
 .../sql/catalyst/expressions/Expression.scala | 58 --------------
 2 files changed, 76 insertions(+), 58 deletions(-)
 create mode 100644 sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Canonicalize.scala

diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Canonicalize.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Canonicalize.scala
new file mode 100644
index 0000000000000..ae1f6006135bb
--- /dev/null
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Canonicalize.scala
@@ -0,0 +1,76 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.sql.catalyst.expressions
+
+/**
+ * Rewrites an expression using rules that are guaranteed to preserve the result while attempting
+ * to remove cosmetic variations. Deterministic expressions that are `equal` after canonicalization
+ * will always return the same answer given the same input (i.e. false positives should not be
+ * possible). However, it is possible that two canonical expressions that are not equal will in fact
+ * return the same answer given any input (i.e. false negatives are possible).
+ *
+ * The following rules are applied:
+ * - Names and nullability hints for [[org.apache.spark.sql.types.DataType]]s are stripped.
+ * - Commutative and associative operations ([[Add]] and [[Multiply]]) have their children ordered
+ *   by `hashCode`.
+ * - [[EqualTo]] and [[EqualNullSafe]] are reordered by `hashCode`.
+ * - Other comparisons ([[GreaterThan]], [[LessThan]]) are reversed by `hashCode`.
+ */
+object Canonicalize {
+  def execute(e: Expression): Expression = {
+    expressionReorder(ignoreNamesTypes(e))
+  }
+
+  /** Remove names and nullability from types. */
+  private def ignoreNamesTypes(e: Expression): Expression = e match {
+    case a: AttributeReference =>
+      AttributeReference("none", a.dataType.asNullable)(exprId = a.exprId)
+    case _ => e
+  }
+
+  /** Collects adjacent commutative operations. */
+  private def gatherCommutative(
+      e: Expression,
+      f: PartialFunction[Expression, Seq[Expression]]): Seq[Expression] = e match {
+    case c if f.isDefinedAt(c) => f(c).flatMap(gatherCommutative(_, f))
+    case other => other :: Nil
+  }
+
+  /** Orders a set of commutative operations by their hash code. */
+  private def orderCommutative(
+      e: Expression,
+      f: PartialFunction[Expression, Seq[Expression]]): Seq[Expression] =
+    gatherCommutative(e, f).sortBy(_.hashCode())
+
+  /** Rearrange expressions that are commutative or associative. */
+  private def expressionReorder(e: Expression): Expression = e match {
+    case a: Add => orderCommutative(a, { case Add(l, r) => Seq(l, r) }).reduce(Add)
+    case m: Multiply => orderCommutative(m, { case Multiply(l, r) => Seq(l, r) }).reduce(Multiply)
+
+    case EqualTo(l, r) if l.hashCode() > r.hashCode() => EqualTo(r, l)
+    case EqualNullSafe(l, r) if l.hashCode() > r.hashCode() => EqualNullSafe(r, l)
+
+    case GreaterThan(l, r) if l.hashCode() > r.hashCode() => LessThan(r, l)
+    case LessThan(l, r) if l.hashCode() > r.hashCode() => GreaterThan(r, l)
+
+    case GreaterThanOrEqual(l, r) if l.hashCode() > r.hashCode() => LessThanOrEqual(r, l)
+    case LessThanOrEqual(l, r) if l.hashCode() > r.hashCode() => GreaterThanOrEqual(r, l)
+
+    case _ => e
+  }
+}
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Expression.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Expression.scala
index 8ce582794479d..16a1b2aee2730 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Expression.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Expression.scala
@@ -601,61 +601,3 @@ abstract class TernaryExpression extends Expression {
     }
   }
 }
-
-/**
- * Rewrites an expression using rules that are guaranteed to preserve the result while attempting
- * to remove cosmetic variations. Deterministic expressions that are `equal` after canonicalization
- * will always return the same answer given the same input (i.e. false positives should not be
- * possible). However, it is possible that two canonical expressions that are not equal will in fact
- * return the same answer given any input (i.e. false negatives are possible).
- *
- * The following rules are applied:
- * - Names and nullability hints for [[org.apache.spark.sql.types.DataType]]s are stripped.
- * - Commutative and associative operations ([[Add]] and [[Multiply]]) have their children ordered
- *   by `hashCode`.
- * - [[EqualTo]] and [[EqualNullSafe]] are reordered by `hashCode`.
- * - Other comparisons ([[GreaterThan]], [[LessThan]]) are reversed by `hashCode`.
- */
-object Canonicalize {
-  def execute(e: Expression): Expression = {
-    expressionReorder(ignoreNamesTypes(e))
-  }
-
-  /** Remove names and nullability from types. */
-  private def ignoreNamesTypes(e: Expression): Expression = e match {
-    case a: AttributeReference =>
-      AttributeReference("none", a.dataType.asNullable)(exprId = a.exprId)
-    case _ => e
-  }
-
-  /** Collects adjacent commutative operations. */
-  private def gatherCommutative(
-      e: Expression,
-      f: PartialFunction[Expression, Seq[Expression]]): Seq[Expression] = e match {
-    case c if f.isDefinedAt(c) => f(c).flatMap(gatherCommutative(_, f))
-    case other => other :: Nil
-  }
-
-  /** Orders a set of commutative operations by their hash code. */
-  private def orderCommutative(
-      e: Expression,
-      f: PartialFunction[Expression, Seq[Expression]]): Seq[Expression] =
-    gatherCommutative(e, f).sortBy(_.hashCode())
-
-  /** Rearrange expressions that are commutative or associative. */
-  private def expressionReorder(e: Expression): Expression = e match {
-    case a: Add => orderCommutative(a, { case Add(l, r) => Seq(l, r) }).reduce(Add)
-    case m: Multiply => orderCommutative(m, { case Multiply(l, r) => Seq(l, r) }).reduce(Multiply)
-
-    case EqualTo(l, r) if l.hashCode() > r.hashCode() => EqualTo(r, l)
-    case EqualNullSafe(l, r) if l.hashCode() > r.hashCode() => EqualNullSafe(r, l)
-
-    case GreaterThan(l, r) if l.hashCode() > r.hashCode() => LessThan(r, l)
-    case LessThan(l, r) if l.hashCode() > r.hashCode() => GreaterThan(r, l)
-
-    case GreaterThanOrEqual(l, r) if l.hashCode() > r.hashCode() => LessThanOrEqual(r, l)
-    case LessThanOrEqual(l, r) if l.hashCode() > r.hashCode() => GreaterThanOrEqual(r, l)
-
-    case _ => e
-  }
-}
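
For context, a minimal sketch of the behaviour this series aims for, written against the Catalyst API as it stands after PATCH 4/4; it is not taken from the patches themselves. The object name `CanonicalizeSketch`, the attribute names `a`/`b`, the use of `IntegerType`, and the assert-style checks are illustrative assumptions.

import org.apache.spark.sql.catalyst.expressions._
import org.apache.spark.sql.types.IntegerType

// Hypothetical harness; the object name and column setup are not part of the patches.
object CanonicalizeSketch {
  def main(args: Array[String]): Unit = {
    // Two attribute references standing in for input columns; names and exprIds are arbitrary.
    val a = AttributeReference("a", IntegerType)()
    val b = AttributeReference("b", IntegerType)()

    // `a + b` and `b + a` differ only cosmetically: children are canonicalized first (PATCH 1/4),
    // then Canonicalize.execute orders the commutative children of Add by hashCode, so both
    // expressions should canonicalize to the same tree.
    val left = Add(a, b)
    val right = Add(b, a)
    assert(left.canonicalized == right.canonicalized)
    assert(left.semanticEquals(right))

    // Comparisons are flipped rather than reordered: `a > b` and `b < a` should canonicalize
    // to the same form, so they are also reported as semantically equal.
    assert(GreaterThan(a, b).semanticEquals(LessThan(b, a)))
  }
}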