From b8cb52a8a7c247df0b8647917f79cbd929a72c3d Mon Sep 17 00:00:00 2001 From: Wenchen Fan Date: Wed, 22 Jan 2020 21:31:11 +0800 Subject: [PATCH 01/17] [SPARK-30555][SQL] MERGE INTO insert action should only access columns from source table ### What changes were proposed in this pull request? when resolving the `Assignment` of insert action in MERGE INTO, only resolve with the source table, to avoid ambiguous attribute failure if there is a same-name column in the target table. ### Why are the changes needed? The insert action is used when NOT MATCHED, so it can't access the row from the target table anyway. ### Does this PR introduce any user-facing change? on ### How was this patch tested? new tests Closes #27265 from cloud-fan/merge. Authored-by: Wenchen Fan Signed-off-by: Wenchen Fan --- .../sql/catalyst/analysis/Analyzer.scala | 27 +- .../command/PlanResolutionSuite.scala | 380 ++++++++++-------- 2 files changed, 229 insertions(+), 178 deletions(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala index a96e04f043ac4..7e9f85b64e4a9 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala @@ -1332,13 +1332,21 @@ class Analyzer( DeleteAction(resolvedDeleteCondition) case UpdateAction(updateCondition, assignments) => val resolvedUpdateCondition = updateCondition.map(resolveExpressionTopDown(_, m)) - UpdateAction(resolvedUpdateCondition, resolveAssignments(assignments, m)) + // The update value can access columns from both target and source tables. + UpdateAction( + resolvedUpdateCondition, + resolveAssignments(assignments, m, resolveValuesWithSourceOnly = false)) case o => o } val newNotMatchedActions = m.notMatchedActions.map { case InsertAction(insertCondition, assignments) => - val resolvedInsertCondition = insertCondition.map(resolveExpressionTopDown(_, m)) - InsertAction(resolvedInsertCondition, resolveAssignments(assignments, m)) + // The insert action is used when not matched, so its condition and value can only + // access columns from the source table. + val resolvedInsertCondition = + insertCondition.map(resolveExpressionTopDown(_, Project(Nil, m.sourceTable))) + InsertAction( + resolvedInsertCondition, + resolveAssignments(assignments, m, resolveValuesWithSourceOnly = true)) case o => o } val resolvedMergeCondition = resolveExpressionTopDown(m.mergeCondition, m) @@ -1353,7 +1361,8 @@ class Analyzer( def resolveAssignments( assignments: Seq[Assignment], - mergeInto: MergeIntoTable): Seq[Assignment] = { + mergeInto: MergeIntoTable, + resolveValuesWithSourceOnly: Boolean): Seq[Assignment] = { if (assignments.isEmpty) { val expandedColumns = mergeInto.targetTable.output val expandedValues = mergeInto.sourceTable.output @@ -1361,12 +1370,18 @@ class Analyzer( } else { assignments.map { assign => val resolvedKey = assign.key match { - case c if !c.resolved => resolveExpressionTopDown(c, mergeInto.targetTable) + case c if !c.resolved => + resolveExpressionTopDown(c, Project(Nil, mergeInto.targetTable)) case o => o } val resolvedValue = assign.value match { // The update values may contain target and/or source references. 
- case c if !c.resolved => resolveExpressionTopDown(c, mergeInto) + case c if !c.resolved => + if (resolveValuesWithSourceOnly) { + resolveExpressionTopDown(c, Project(Nil, mergeInto.sourceTable)) + } else { + resolveExpressionTopDown(c, mergeInto) + } case o => o } Assignment(resolvedKey, resolvedValue) diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/command/PlanResolutionSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/command/PlanResolutionSuite.scala index abc20049735cb..8c73b366fa857 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/command/PlanResolutionSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/command/PlanResolutionSuite.scala @@ -26,9 +26,9 @@ import org.mockito.invocation.InvocationOnMock import org.apache.spark.sql.{AnalysisException, SaveMode} import org.apache.spark.sql.catalyst.{AliasIdentifier, TableIdentifier} -import org.apache.spark.sql.catalyst.analysis.{AnalysisTest, Analyzer, CTESubstitution, EmptyFunctionRegistry, NoSuchTableException, ResolveCatalogs, ResolvedTable, ResolveSessionCatalog, UnresolvedAttribute, UnresolvedRelation, UnresolvedStar, UnresolvedSubqueryColumnAliases, UnresolvedTable} +import org.apache.spark.sql.catalyst.analysis._ import org.apache.spark.sql.catalyst.catalog.{BucketSpec, CatalogStorageFormat, CatalogTable, CatalogTableType, InMemoryCatalog, SessionCatalog} -import org.apache.spark.sql.catalyst.expressions.{EqualTo, InSubquery, IntegerLiteral, ListQuery, StringLiteral} +import org.apache.spark.sql.catalyst.expressions.{AttributeReference, EqualTo, Expression, InSubquery, IntegerLiteral, ListQuery, StringLiteral} import org.apache.spark.sql.catalyst.parser.CatalystSqlParser import org.apache.spark.sql.catalyst.plans.logical.{AlterTable, Assignment, CreateTableAsSelect, CreateV2Table, DeleteAction, DeleteFromTable, DescribeRelation, DropTable, InsertAction, LocalRelation, LogicalPlan, MergeIntoTable, OneRowRelation, Project, SubqueryAlias, UpdateAction, UpdateTable} import org.apache.spark.sql.connector.InMemoryTableProvider @@ -45,13 +45,13 @@ class PlanResolutionSuite extends AnalysisTest { private val table: Table = { val t = mock(classOf[Table]) - when(t.schema()).thenReturn(new StructType().add("i", "int")) + when(t.schema()).thenReturn(new StructType().add("i", "int").add("s", "string")) t } private val v1Table: V1Table = { val t = mock(classOf[CatalogTable]) - when(t.schema).thenReturn(new StructType().add("i", "int")) + when(t.schema).thenReturn(new StructType().add("i", "int").add("s", "string")) when(t.tableType).thenReturn(CatalogTableType.MANAGED) V1Table(t) } @@ -128,12 +128,17 @@ class PlanResolutionSuite extends AnalysisTest { catalogManagerWithoutDefault } val analyzer = new Analyzer(catalogManager, conf) + // TODO: run the analyzer directly. 
val rules = Seq( CTESubstitution, analyzer.ResolveRelations, analyzer.ResolveTables, new ResolveCatalogs(catalogManager), - new ResolveSessionCatalog(catalogManager, conf, _ == Seq("v"))) + new ResolveSessionCatalog(catalogManager, conf, _ == Seq("v")), + analyzer.ResolveTables, + analyzer.ResolveReferences, + analyzer.ResolveSubqueryColumnAliases, + analyzer.ResolveReferences) rules.foldLeft(parsePlan(query)) { case (plan, rule) => rule.apply(plan) } @@ -1055,6 +1060,49 @@ class PlanResolutionSuite extends AnalysisTest { } test("MERGE INTO TABLE") { + def checkResolution( + target: LogicalPlan, + source: LogicalPlan, + mergeCondition: Expression, + deleteCondAttr: Option[AttributeReference], + updateCondAttr: Option[AttributeReference], + insertCondAttr: Option[AttributeReference], + updateAssigns: Seq[Assignment], + insertAssigns: Seq[Assignment], + starInUpdate: Boolean = false): Unit = { + val ti = target.output.find(_.name == "i").get.asInstanceOf[AttributeReference] + val ts = target.output.find(_.name == "s").get.asInstanceOf[AttributeReference] + val si = source.output.find(_.name == "i").get.asInstanceOf[AttributeReference] + val ss = source.output.find(_.name == "s").get.asInstanceOf[AttributeReference] + + mergeCondition match { + case EqualTo(l: AttributeReference, r: AttributeReference) => + assert(l.sameRef(ti) && r.sameRef(si)) + case other => fail("unexpected merge condition " + other) + } + + deleteCondAttr.foreach(a => assert(a.sameRef(ts))) + updateCondAttr.foreach(a => assert(a.sameRef(ts))) + insertCondAttr.foreach(a => assert(a.sameRef(ss))) + + if (starInUpdate) { + assert(updateAssigns.size == 2) + assert(updateAssigns(0).key.asInstanceOf[AttributeReference].sameRef(ti)) + assert(updateAssigns(0).value.asInstanceOf[AttributeReference].sameRef(si)) + assert(updateAssigns(1).key.asInstanceOf[AttributeReference].sameRef(ts)) + assert(updateAssigns(1).value.asInstanceOf[AttributeReference].sameRef(ss)) + } else { + assert(updateAssigns.size == 1) + assert(updateAssigns.head.key.asInstanceOf[AttributeReference].sameRef(ts)) + assert(updateAssigns.head.value.asInstanceOf[AttributeReference].sameRef(ss)) + } + assert(insertAssigns.size == 2) + assert(insertAssigns(0).key.asInstanceOf[AttributeReference].sameRef(ti)) + assert(insertAssigns(0).value.asInstanceOf[AttributeReference].sameRef(si)) + assert(insertAssigns(1).key.asInstanceOf[AttributeReference].sameRef(ts)) + assert(insertAssigns(1).value.asInstanceOf[AttributeReference].sameRef(ss)) + } + Seq(("v2Table", "v2Table1"), ("testcat.tab", "testcat.tab1")).foreach { case(target, source) => // basic @@ -1065,9 +1113,25 @@ class PlanResolutionSuite extends AnalysisTest { |ON target.i = source.i |WHEN MATCHED AND (target.s='delete') THEN DELETE |WHEN MATCHED AND (target.s='update') THEN UPDATE SET target.s = source.s - |WHEN NOT MATCHED AND (target.s='insert') + |WHEN NOT MATCHED AND (source.s='insert') | THEN INSERT (target.i, target.s) values (source.i, source.s) """.stripMargin + parseAndResolve(sql1) match { + case MergeIntoTable( + SubqueryAlias(AliasIdentifier("target", None), target: DataSourceV2Relation), + SubqueryAlias(AliasIdentifier("source", None), source: DataSourceV2Relation), + mergeCondition, + Seq(DeleteAction(Some(EqualTo(dl: AttributeReference, StringLiteral("delete")))), + UpdateAction(Some(EqualTo(ul: AttributeReference, StringLiteral("update"))), + updateAssigns)), + Seq(InsertAction(Some(EqualTo(il: AttributeReference, StringLiteral("insert"))), + insertAssigns))) => + checkResolution(target, 
source, mergeCondition, Some(dl), Some(ul), Some(il), + updateAssigns, insertAssigns) + + case other => fail("Expect MergeIntoTable, but got:\n" + other.treeString) + } + // star val sql2 = s""" @@ -1076,8 +1140,24 @@ class PlanResolutionSuite extends AnalysisTest { |ON target.i = source.i |WHEN MATCHED AND (target.s='delete') THEN DELETE |WHEN MATCHED AND (target.s='update') THEN UPDATE SET * - |WHEN NOT MATCHED AND (target.s='insert') THEN INSERT * + |WHEN NOT MATCHED AND (source.s='insert') THEN INSERT * """.stripMargin + parseAndResolve(sql2) match { + case MergeIntoTable( + SubqueryAlias(AliasIdentifier("target", None), target: DataSourceV2Relation), + SubqueryAlias(AliasIdentifier("source", None), source: DataSourceV2Relation), + mergeCondition, + Seq(DeleteAction(Some(EqualTo(dl: AttributeReference, StringLiteral("delete")))), + UpdateAction(Some(EqualTo(ul: AttributeReference, + StringLiteral("update"))), updateAssigns)), + Seq(InsertAction(Some(EqualTo(il: AttributeReference, StringLiteral("insert"))), + insertAssigns))) => + checkResolution(target, source, mergeCondition, Some(dl), Some(ul), Some(il), + updateAssigns, insertAssigns, starInUpdate = true) + + case other => fail("Expect MergeIntoTable, but got:\n" + other.treeString) + } + // no additional conditions val sql3 = s""" @@ -1088,6 +1168,19 @@ class PlanResolutionSuite extends AnalysisTest { |WHEN MATCHED THEN UPDATE SET target.s = source.s |WHEN NOT MATCHED THEN INSERT (target.i, target.s) values (source.i, source.s) """.stripMargin + parseAndResolve(sql3) match { + case MergeIntoTable( + SubqueryAlias(AliasIdentifier("target", None), target: DataSourceV2Relation), + SubqueryAlias(AliasIdentifier("source", None), source: DataSourceV2Relation), + mergeCondition, + Seq(DeleteAction(None), UpdateAction(None, updateAssigns)), + Seq(InsertAction(None, insertAssigns))) => + checkResolution(target, source, mergeCondition, None, None, None, + updateAssigns, insertAssigns) + + case other => fail("Expect MergeIntoTable, but got:\n" + other.treeString) + } + // using subquery val sql4 = s""" @@ -1096,9 +1189,25 @@ class PlanResolutionSuite extends AnalysisTest { |ON target.i = source.i |WHEN MATCHED AND (target.s='delete') THEN DELETE |WHEN MATCHED AND (target.s='update') THEN UPDATE SET target.s = source.s - |WHEN NOT MATCHED AND (target.s='insert') + |WHEN NOT MATCHED AND (source.s='insert') | THEN INSERT (target.i, target.s) values (source.i, source.s) """.stripMargin + parseAndResolve(sql4) match { + case MergeIntoTable( + SubqueryAlias(AliasIdentifier("target", None), target: DataSourceV2Relation), + SubqueryAlias(AliasIdentifier("source", None), source: Project), + mergeCondition, + Seq(DeleteAction(Some(EqualTo(dl: AttributeReference, StringLiteral("delete")))), + UpdateAction(Some(EqualTo(ul: AttributeReference, StringLiteral("update"))), + updateAssigns)), + Seq(InsertAction(Some(EqualTo(il: AttributeReference, StringLiteral("insert"))), + insertAssigns))) => + checkResolution(target, source, mergeCondition, Some(dl), Some(ul), Some(il), + updateAssigns, insertAssigns) + + case other => fail("Expect MergeIntoTable, but got:\n" + other.treeString) + } + // cte val sql5 = s""" @@ -1109,142 +1218,24 @@ class PlanResolutionSuite extends AnalysisTest { |ON target.i = source.i |WHEN MATCHED AND (target.s='delete') THEN DELETE |WHEN MATCHED AND (target.s='update') THEN UPDATE SET target.s = source.s - |WHEN NOT MATCHED AND (target.s='insert') + |WHEN NOT MATCHED AND (source.s='insert') |THEN INSERT (target.i, target.s) values 
(source.i, source.s) """.stripMargin - - val parsed1 = parseAndResolve(sql1) - val parsed2 = parseAndResolve(sql2) - val parsed3 = parseAndResolve(sql3) - val parsed4 = parseAndResolve(sql4) - val parsed5 = parseAndResolve(sql5) - - parsed1 match { + parseAndResolve(sql5) match { case MergeIntoTable( - SubqueryAlias(AliasIdentifier("target", None), _: DataSourceV2Relation), - SubqueryAlias(AliasIdentifier("source", None), _: DataSourceV2Relation), - EqualTo(l: UnresolvedAttribute, r: UnresolvedAttribute), - Seq(DeleteAction(Some(EqualTo(dl: UnresolvedAttribute, StringLiteral("delete")))), - UpdateAction(Some(EqualTo(ul: UnresolvedAttribute, StringLiteral("update"))), + SubqueryAlias(AliasIdentifier("target", None), target: DataSourceV2Relation), + SubqueryAlias(AliasIdentifier("source", None), source: Project), + mergeCondition, + Seq(DeleteAction(Some(EqualTo(dl: AttributeReference, StringLiteral("delete")))), + UpdateAction(Some(EqualTo(ul: AttributeReference, StringLiteral("update"))), updateAssigns)), - Seq(InsertAction(Some(EqualTo(il: UnresolvedAttribute, StringLiteral("insert"))), + Seq(InsertAction(Some(EqualTo(il: AttributeReference, StringLiteral("insert"))), insertAssigns))) => - assert(l.name == "target.i" && r.name == "source.i") - assert(dl.name == "target.s") - assert(ul.name == "target.s") - assert(il.name == "target.s") - assert(updateAssigns.size == 1) - assert(updateAssigns.head.key.isInstanceOf[UnresolvedAttribute] && - updateAssigns.head.key.asInstanceOf[UnresolvedAttribute].name == "target.s") - assert(updateAssigns.head.value.isInstanceOf[UnresolvedAttribute] && - updateAssigns.head.value.asInstanceOf[UnresolvedAttribute].name == "source.s") - assert(insertAssigns.size == 2) - assert(insertAssigns.head.key.isInstanceOf[UnresolvedAttribute] && - insertAssigns.head.key.asInstanceOf[UnresolvedAttribute].name == "target.i") - assert(insertAssigns.head.value.isInstanceOf[UnresolvedAttribute] && - insertAssigns.head.value.asInstanceOf[UnresolvedAttribute].name == "source.i") - - case _ => fail("Expect MergeIntoTable, but got:\n" + parsed1.treeString) - } + assert(source.output.map(_.name) == Seq("i", "s")) + checkResolution(target, source, mergeCondition, Some(dl), Some(ul), Some(il), + updateAssigns, insertAssigns) - parsed2 match { - case MergeIntoTable( - SubqueryAlias(AliasIdentifier("target", None), _: DataSourceV2Relation), - SubqueryAlias(AliasIdentifier("source", None), _: DataSourceV2Relation), - EqualTo(l: UnresolvedAttribute, r: UnresolvedAttribute), - Seq(DeleteAction(Some(EqualTo(dl: UnresolvedAttribute, StringLiteral("delete")))), - UpdateAction(Some(EqualTo(ul: UnresolvedAttribute, - StringLiteral("update"))), Seq())), - Seq(InsertAction(Some(EqualTo(il: UnresolvedAttribute, StringLiteral("insert"))), - Seq()))) => - assert(l.name == "target.i" && r.name == "source.i") - assert(dl.name == "target.s") - assert(ul.name == "target.s") - assert(il.name == "target.s") - - case _ => fail("Expect MergeIntoTable, but got:\n" + parsed2.treeString) - } - - parsed3 match { - case MergeIntoTable( - SubqueryAlias(AliasIdentifier("target", None), _: DataSourceV2Relation), - SubqueryAlias(AliasIdentifier("source", None), _: DataSourceV2Relation), - EqualTo(l: UnresolvedAttribute, r: UnresolvedAttribute), - Seq(DeleteAction(None), UpdateAction(None, updateAssigns)), - Seq(InsertAction(None, insertAssigns))) => - assert(l.name == "target.i" && r.name == "source.i") - assert(updateAssigns.size == 1) - assert(updateAssigns.head.key.isInstanceOf[UnresolvedAttribute] && - 
updateAssigns.head.key.asInstanceOf[UnresolvedAttribute].name == "target.s") - assert(updateAssigns.head.value.isInstanceOf[UnresolvedAttribute] && - updateAssigns.head.value.asInstanceOf[UnresolvedAttribute].name == "source.s") - assert(insertAssigns.size == 2) - assert(insertAssigns.head.key.isInstanceOf[UnresolvedAttribute] && - insertAssigns.head.key.asInstanceOf[UnresolvedAttribute].name == "target.i") - assert(insertAssigns.head.value.isInstanceOf[UnresolvedAttribute] && - insertAssigns.head.value.asInstanceOf[UnresolvedAttribute].name == "source.i") - - case _ => fail("Expect MergeIntoTable, but got:\n" + parsed3.treeString) - } - - parsed4 match { - case MergeIntoTable( - SubqueryAlias(AliasIdentifier("target", None), _: DataSourceV2Relation), - SubqueryAlias(AliasIdentifier("source", None), _: Project), - EqualTo(l: UnresolvedAttribute, r: UnresolvedAttribute), - Seq(DeleteAction(Some(EqualTo(dl: UnresolvedAttribute, StringLiteral("delete")))), - UpdateAction(Some(EqualTo(ul: UnresolvedAttribute, StringLiteral("update"))), - updateAssigns)), - Seq(InsertAction(Some(EqualTo(il: UnresolvedAttribute, StringLiteral("insert"))), - insertAssigns))) => - assert(l.name == "target.i" && r.name == "source.i") - assert(dl.name == "target.s") - assert(ul.name == "target.s") - assert(il.name == "target.s") - assert(updateAssigns.size == 1) - assert(updateAssigns.head.key.isInstanceOf[UnresolvedAttribute] && - updateAssigns.head.key.asInstanceOf[UnresolvedAttribute].name == "target.s") - assert(updateAssigns.head.value.isInstanceOf[UnresolvedAttribute] && - updateAssigns.head.value.asInstanceOf[UnresolvedAttribute].name == "source.s") - assert(insertAssigns.head.key.isInstanceOf[UnresolvedAttribute] && - insertAssigns.head.key.asInstanceOf[UnresolvedAttribute].name == "target.i") - assert(insertAssigns.head.value.isInstanceOf[UnresolvedAttribute] && - insertAssigns.head.value.asInstanceOf[UnresolvedAttribute].name == "source.i") - - case _ => fail("Expect MergeIntoTable, but got:\n" + parsed4.treeString) - } - - parsed5 match { - case MergeIntoTable( - SubqueryAlias(AliasIdentifier("target", None), _: DataSourceV2Relation), - SubqueryAlias(AliasIdentifier("source", None), - UnresolvedSubqueryColumnAliases(outputColumnNames, - Project(projects, _: DataSourceV2Relation))), - EqualTo(l: UnresolvedAttribute, r: UnresolvedAttribute), - Seq(DeleteAction(Some(EqualTo(dl: UnresolvedAttribute, StringLiteral("delete")))), - UpdateAction(Some(EqualTo(ul: UnresolvedAttribute, StringLiteral("update"))), - updateAssigns)), - Seq(InsertAction(Some(EqualTo(il: UnresolvedAttribute, StringLiteral("insert"))), - insertAssigns))) => - assert(outputColumnNames.size == 2 && - outputColumnNames.head == "i" && - outputColumnNames.last == "s") - assert(projects.size == 1 && projects.head.isInstanceOf[UnresolvedStar]) - assert(l.name == "target.i" && r.name == "source.i") - assert(dl.name == "target.s") - assert(ul.name == "target.s") - assert(il.name == "target.s") - assert(updateAssigns.size == 1) - assert(updateAssigns.head.key.isInstanceOf[UnresolvedAttribute] && - updateAssigns.head.key.asInstanceOf[UnresolvedAttribute].name == "target.s") - assert(updateAssigns.head.value.isInstanceOf[UnresolvedAttribute] && - updateAssigns.head.value.asInstanceOf[UnresolvedAttribute].name == "source.s") - assert(insertAssigns.head.key.isInstanceOf[UnresolvedAttribute] && - insertAssigns.head.key.asInstanceOf[UnresolvedAttribute].name == "target.i") - assert(insertAssigns.head.value.isInstanceOf[UnresolvedAttribute] && - 
insertAssigns.head.value.asInstanceOf[UnresolvedAttribute].name == "source.i") - - case _ => fail("Expect MergeIntoTable, but got:\n" + parsed5.treeString) + case other => fail("Expect MergeIntoTable, but got:\n" + other.treeString) } } @@ -1255,49 +1246,94 @@ class PlanResolutionSuite extends AnalysisTest { val target = pair._1 val source = pair._2 - val sql = + val sql1 = s""" |MERGE INTO $target |USING $source - |ON $target.i = $source.i - |WHEN MATCHED AND ($target.s='delete') THEN DELETE - |WHEN MATCHED AND ($target.s='update') THEN UPDATE SET $target.s = $source.s - |WHEN NOT MATCHED AND ($target.s='insert') - |THEN INSERT ($target.i, $target.s) values ($source.i, $source.s) + |ON 1 = 1 + |WHEN MATCHED THEN DELETE + |WHEN MATCHED THEN UPDATE SET s = 1 + |WHEN NOT MATCHED AND (s = 'a') THEN INSERT (i) values (i) """.stripMargin - val parsed = parseAndResolve(sql) - - parsed match { + parseAndResolve(sql1) match { case MergeIntoTable( - _: DataSourceV2Relation, - _: DataSourceV2Relation, - EqualTo(l: UnresolvedAttribute, r: UnresolvedAttribute), - Seq(DeleteAction(Some(EqualTo(dl: UnresolvedAttribute, StringLiteral("delete")))), - UpdateAction(Some(EqualTo(ul: UnresolvedAttribute, StringLiteral("update"))), - updateAssigns)), - Seq(InsertAction(Some(EqualTo(il: UnresolvedAttribute, StringLiteral("insert"))), - insertAssigns))) => - assert(l.name == s"$target.i" && r.name == s"$source.i") - assert(dl.name == s"$target.s") - assert(ul.name == s"$target.s") - assert(il.name == s"$target.s") + target: DataSourceV2Relation, + source: DataSourceV2Relation, + _, + Seq(DeleteAction(None), UpdateAction(None, updateAssigns)), + Seq(InsertAction( + Some(EqualTo(il: AttributeReference, StringLiteral("a"))), + insertAssigns))) => + val ti = target.output.find(_.name == "i").get + val ts = target.output.find(_.name == "s").get + val si = source.output.find(_.name == "i").get + val ss = source.output.find(_.name == "s").get + + // INSERT condition is resolved with source table only, so column `s` is not ambiguous. + assert(il.sameRef(ss)) assert(updateAssigns.size == 1) - assert(updateAssigns.head.key.isInstanceOf[UnresolvedAttribute] && - updateAssigns.head.key.asInstanceOf[UnresolvedAttribute].name == s"$target.s") - assert(updateAssigns.head.value.isInstanceOf[UnresolvedAttribute] && - updateAssigns.head.value.asInstanceOf[UnresolvedAttribute].name == s"$source.s") - assert(insertAssigns.size == 2) - assert(insertAssigns.head.key.isInstanceOf[UnresolvedAttribute] && - insertAssigns.head.key.asInstanceOf[UnresolvedAttribute].name == s"$target.i") - assert(insertAssigns.head.value.isInstanceOf[UnresolvedAttribute] && - insertAssigns.head.value.asInstanceOf[UnresolvedAttribute].name == s"$source.i") - - case _ => fail("Expect MergeIntoTable, but got:\n" + parsed.treeString) + // UPDATE key is resolved with target table only, so column `s` is not ambiguous. + assert(updateAssigns.head.key.asInstanceOf[AttributeReference].sameRef(ts)) + assert(insertAssigns.size == 1) + // INSERT key is resolved with target table only, so column `i` is not ambiguous. + assert(insertAssigns.head.key.asInstanceOf[AttributeReference].sameRef(ti)) + // INSERT value is resolved with source table only, so column `i` is not ambiguous. 
+ assert(insertAssigns.head.value.asInstanceOf[AttributeReference].sameRef(si)) + + case p => fail("Expect MergeIntoTable, but got:\n" + p.treeString) } + + val sql2 = + s""" + |MERGE INTO $target + |USING $source + |ON i = 1 + |WHEN MATCHED THEN DELETE + """.stripMargin + // merge condition is resolved with both target and source tables, and we can't + // resolve column `i` as it's ambiguous. + val e2 = intercept[AnalysisException](parseAndResolve(sql2)) + assert(e2.message.contains("Reference 'i' is ambiguous")) + + val sql3 = + s""" + |MERGE INTO $target + |USING $source + |ON 1 = 1 + |WHEN MATCHED AND (s='delete') THEN DELETE + """.stripMargin + // delete condition is resolved with both target and source tables, and we can't + // resolve column `s` as it's ambiguous. + val e3 = intercept[AnalysisException](parseAndResolve(sql3)) + assert(e3.message.contains("Reference 's' is ambiguous")) + + val sql4 = + s""" + |MERGE INTO $target + |USING $source + |ON 1 = 1 + |WHEN MATCHED AND (s = 'a') THEN UPDATE SET i = 1 + """.stripMargin + // update condition is resolved with both target and source tables, and we can't + // resolve column `s` as it's ambiguous. + val e4 = intercept[AnalysisException](parseAndResolve(sql4)) + assert(e4.message.contains("Reference 's' is ambiguous")) + + val sql5 = + s""" + |MERGE INTO $target + |USING $source + |ON 1 = 1 + |WHEN MATCHED THEN UPDATE SET s = s + """.stripMargin + // update value is resolved with both target and source tables, and we can't + // resolve column `s` as it's ambiguous. + val e5 = intercept[AnalysisException](parseAndResolve(sql5)) + assert(e5.message.contains("Reference 's' is ambiguous")) } - val sql = + val sql6 = s""" |MERGE INTO non_exist_target |USING non_exist_source @@ -1306,7 +1342,7 @@ class PlanResolutionSuite extends AnalysisTest { |WHEN MATCHED THEN UPDATE SET * |WHEN NOT MATCHED THEN INSERT * """.stripMargin - val parsed = parseAndResolve(sql) + val parsed = parseAndResolve(sql6) parsed match { case u: MergeIntoTable => assert(u.targetTable.isInstanceOf[UnresolvedRelation]) From 1c46bd9e6072cdd1656288e949374440a5c9907c Mon Sep 17 00:00:00 2001 From: zhengruifeng Date: Wed, 22 Jan 2020 08:24:11 -0600 Subject: [PATCH 02/17] [SPARK-30503][ML] OnlineLDAOptimizer does not handle persistance correctly ### What changes were proposed in this pull request? unpersist graph outside checkpointer, like what Pregel does ### Why are the changes needed? Shown in [SPARK-30503](https://issues.apache.org/jira/browse/SPARK-30503), intermediate edges are not unpersisted ### Does this PR introduce any user-facing change? No ### How was this patch tested? existing testsuites and manual test Closes #27261 from zhengruifeng/lda_checkpointer. Authored-by: zhengruifeng Signed-off-by: Sean Owen --- .../org/apache/spark/mllib/clustering/LDAOptimizer.scala | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDAOptimizer.scala b/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDAOptimizer.scala index 5eea69022562b..dc90f6c3e3885 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDAOptimizer.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDAOptimizer.scala @@ -142,7 +142,7 @@ final class EMLDAOptimizer extends LDAOptimizer { // For each document, create an edge (Document -> Term) for each unique term in the document. 
val edges: RDD[Edge[TokenCount]] = docs.flatMap { case (docID: Long, termCounts: Vector) => // Add edges for terms with non-zero counts. - termCounts.asBreeze.activeIterator.filter(_._2 != 0.0).map { case (term, cnt) => + termCounts.nonZeroIterator.map { case (term, cnt) => Edge(docID, term2index(term), cnt) } } @@ -211,11 +211,14 @@ final class EMLDAOptimizer extends LDAOptimizer { val docTopicDistributions: VertexRDD[TopicCounts] = graph.aggregateMessages[(Boolean, TopicCounts)](sendMsg, mergeMsg) .mapValues(_._2) + val prevGraph = graph // Update the vertex descriptors with the new counts. val newGraph = Graph(docTopicDistributions, graph.edges) graph = newGraph graphCheckpointer.update(newGraph) globalTopicTotals = computeGlobalTopicTotals() + prevGraph.unpersistVertices() + prevGraph.edges.unpersist() this } From 8097b7eaf3e66a956b19207cbecca3776c233767 Mon Sep 17 00:00:00 2001 From: Dilip Biswal Date: Wed, 22 Jan 2020 08:41:31 -0600 Subject: [PATCH 03/17] [SPARK-30573][DOC] Document WHERE Clause of SELECT statement in SQL Reference ### What changes were proposed in this pull request? Document WHERE Clause of SELECT statement in SQL Reference Guide. I ### Why are the changes needed? Currently Spark lacks documentation on the supported SQL constructs causing confusion among users who sometimes have to look at the code to understand the usage. This is aimed at addressing this issue. ### Does this PR introduce any user-facing change? Yes. **Before:** There was no documentation for this. **After** Screen Shot 2020-01-19 at 5 03 49 PM Screen Shot 2020-01-19 at 5 04 07 PM Screen Shot 2020-01-19 at 5 04 23 PM ### How was this patch tested? Tested using jykyll build --serve Closes #27282 from dilipbiswal/sql-ref-select-where. Authored-by: Dilip Biswal Signed-off-by: Sean Owen --- docs/sql-ref-syntax-qry-select-where.md | 113 ++++++++++++++++++++++++ 1 file changed, 113 insertions(+) create mode 100644 docs/sql-ref-syntax-qry-select-where.md diff --git a/docs/sql-ref-syntax-qry-select-where.md b/docs/sql-ref-syntax-qry-select-where.md new file mode 100644 index 0000000000000..2c8d449172eed --- /dev/null +++ b/docs/sql-ref-syntax-qry-select-where.md @@ -0,0 +1,113 @@ +--- +layout: global +title: SELECT +displayTitle: WHERE clause +license: | + Licensed to the Apache Software Foundation (ASF) under one or more + contributor license agreements. See the NOTICE file distributed with + this work for additional information regarding copyright ownership. + The ASF licenses this file to You under the Apache License, Version 2.0 + (the "License"); you may not use this file except in compliance with + the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +--- +The WHERE clause is used to limit the results of the FROM +clause of query or subquery based on the specified condition. + +### Syntax +{% highlight sql %} +WHERE boolean_expression +{% endhighlight %} + +### Parameters +
+<dl>
+  <dt><code><em>boolean_expression</em></code></dt>
+  <dd>
+    Specifies any expression that evaluates to a result type <code>boolean</code>. Two or
+    more expressions may be combined together using the logical
+    operators ( <code>AND</code>, <code>OR</code> ).
+  </dd>
+</dl>
+ +### Examples +{% highlight sql %} +CREATE TABLE person (id INT, name STRING, age INT); +INSERT INTO person VALUES (100, 'John', 30), + (200, 'Mary', NULL), + (300, 'Mike', 80), + (400, 'Dan', 50); + +-- Comparison operator in `WHERE` clause. +SELECT * FROM person WHERE id > 200 ORDER BY id; + +---+----+---+ + |id |name|age| + +---+----+---+ + |300|Mike|80 | + |400|Dan |50 | + +---+----+---+ + +-- Comparison and logical operators in `WHERE` clause. +SELECT * FROM person WHERE id = 200 OR id = 300 ORDER BY id; + +---+----+----+ + |id |name|age | + +---+----+----+ + |200|Mary|null| + |300|Mike|80 | + +---+----+----+ + +-- ISNULL expression in `WHERE` clause. +SELECT * FROM person WHERE id > 300 OR age IS NULL ORDER BY id; + +---+----+----+ + |id |name|age | + +---+----+----+ + |200|Mary|null| + |400|Dan |50 | + +---+----+----+ + +-- Function expression in `WHERE` clause. +SELECT * FROM person WHERE length(name) > 3 ORDER BY id; + +---+----+----+ + |id |name|age | + +---+----+----+ + |100|John|30 | + |200|Mary|null| + |300|Mike|80 | + +---+----+----+ + +-- `BETWEEN` expression in `WHERE` clause. +SELECT * FROM person WHERE id BETWEEN 200 AND 300 ORDER BY id; + +---+----+----+ + | id|name| age| + +---+----+----+ + |200|Mary|null| + |300|Mike| 80| + +---+----+----+ + +-- Scalar Subquery in `WHERE` clause. +SELECT * FROM person WHERE age > (SELECT avg(age) FROM person); + +---+----+---+ + |id |name|age| + +---+----+---+ + |300|Mike|80 | + +---+----+---+ + +-- Correlated column reference in `WHERE` clause of subquery. +SELECT * FROM person AS parent +WHERE EXISTS ( + SELECT 1 FROM person AS child + WHERE parent.id = child.id AND child.age IS NULL + ); + +---+----+----+ + |id |name|age | + +---+----+----+ + |200|Mary|null| + +---+----+----+ + +{% endhighlight %} From a6030eff3059033155cff4ed9dac6958d186426c Mon Sep 17 00:00:00 2001 From: Dilip Biswal Date: Wed, 22 Jan 2020 08:45:03 -0600 Subject: [PATCH 04/17] [SPARK-30575][DOC] Document HAVING Clause of SELECT statement in SQL Reference ### What changes were proposed in this pull request? Document HAVING clause of SELECT statement in SQL Reference Guide. ### Why are the changes needed? Currently Spark lacks documentation on the supported SQL constructs causing confusion among users who sometimes have to look at the code to understand the usage. This is aimed at addressing this issue. ### Does this PR introduce any user-facing change? Yes. **Before:** There was no documentation for this. **After.** Screen Shot 2020-01-19 at 6 03 52 PM Screen Shot 2020-01-19 at 6 04 11 PM Screen Shot 2020-01-19 at 6 04 28 PM ### How was this patch tested? Tested using jykyll build --serve Closes #27284 from dilipbiswal/sql-ref-select-having. Authored-by: Dilip Biswal Signed-off-by: Sean Owen --- docs/sql-ref-syntax-qry-select-having.md | 99 +++++++++++++++++++++++- 1 file changed, 98 insertions(+), 1 deletion(-) diff --git a/docs/sql-ref-syntax-qry-select-having.md b/docs/sql-ref-syntax-qry-select-having.md index ca92eb0d4daf0..718c22fd4b114 100644 --- a/docs/sql-ref-syntax-qry-select-having.md +++ b/docs/sql-ref-syntax-qry-select-having.md @@ -18,5 +18,102 @@ license: | See the License for the specific language governing permissions and limitations under the License. --- +The HAVING clause is used to filter the results produced by +GROUP BY based on the specified condition. It is often used +in the conjunction with a [GROUP BY](sql-ref-syntax-qry-select-groupby.html) +clause. 
-**This page is under construction** +### Syntax +{% highlight sql %} +HAVING boolean_expression +{% endhighlight %} + +### Parameters +
+<dl>
+  <dt><code><em>boolean_expression</em></code></dt>
+  <dd>
+    Specifies any expression that evaluates to a result type <code>boolean</code>. Two or
+    more expressions may be combined together using the logical
+    operators ( <code>AND</code>, <code>OR</code> ).<br><br>
+    <b>Note</b><br>
+    The expressions specified in the <code>HAVING</code> clause can only refer to:
+    <ol>
+      <li>Constants</li>
+      <li>Expressions that appear in <code>GROUP BY</code></li>
+      <li>Aggregate functions</li>
+    </ol>
+  </dd>
+</dl>
+ +### Examples +{% highlight sql %} +CREATE TABLE dealer (id INT, city STRING, car_model STRING, quantity INT); +INSERT INTO dealer VALUES (100, 'Fremont', 'Honda Civic', 10), + (100, 'Fremont', 'Honda Accord', 15), + (100, 'Fremont', 'Honda CRV', 7), + (200, 'Dublin', 'Honda Civic', 20), + (200, 'Dublin', 'Honda Accord', 10), + (200, 'Dublin', 'Honda CRV', 3), + (300, 'San Jose', 'Honda Civic', 5), + (300, 'San Jose', 'Honda Accord', 8); + +-- `HAVING` clause referring to column in `GROUP BY`. +SELECT city, sum(quantity) AS sum FROM dealer GROUP BY city HAVING city = 'Fremont'; + + +-------+---+ + |city |sum| + +-------+---+ + |Fremont|32 | + +-------+---+ + +-- `HAVING` clause referring to aggregate function. +SELECT city, sum(quantity) AS sum FROM dealer GROUP BY city HAVING sum(quantity) > 15; + + +-------+---+ + | city|sum| + +-------+---+ + | Dublin| 33| + |Fremont| 32| + +-------+---+ + +-- `HAVING` clause referring to aggregate function by its alias. +SELECT city, sum(quantity) AS sum FROM dealer GROUP BY city HAVING sum > 15; + + +-------+---+ + | city|sum| + +-------+---+ + | Dublin| 33| + |Fremont| 32| + +-------+---+ + +-- `HAVING` clause referring to a different aggregate function than what is present in +-- `SELECT` list. +SELECT city, sum(quantity) AS sum FROM dealer GROUP BY city HAVING max(quantity) > 15; + + +------+---+ + |city |sum| + +------+---+ + |Dublin|33 | + +------+---+ + +-- `HAVING` clause referring to constant expression. +SELECT city, sum(quantity) AS sum FROM dealer GROUP BY city HAVING 1 > 0 ORDER BY city; + + +--------+---+ + | city|sum| + +--------+---+ + | Dublin| 33| + | Fremont| 32| + |San Jose| 13| + +--------+---+ + +-- `HAVING` clause without a `GROUP BY` clause. +SELECT sum(quantity) AS sum FROM dealer HAVING sum(quantity) > 10; + +---+ + |sum| + +---+ + | 78| + +---+ + +{% endhighlight %} From 8f7f4d57958b9896bfacce81a80e1285f9461340 Mon Sep 17 00:00:00 2001 From: Dilip Biswal Date: Wed, 22 Jan 2020 08:59:34 -0600 Subject: [PATCH 05/17] [SPARK-30583][DOC] Document LIMIT Clause of SELECT statement in SQL Reference ### What changes were proposed in this pull request? Document LIMIT clause of SELECT statement in SQL Reference Guide. ### Why are the changes needed? Currently Spark lacks documentation on the supported SQL constructs causing confusion among users who sometimes have to look at the code to understand the usage. This is aimed at addressing this issue. ### Does this PR introduce any user-facing change? Yes. **Before:** There was no documentation for this. **After.** Screen Shot 2020-01-20 at 1 37 28 AM Screen Shot 2020-01-20 at 1 37 43 AM ### How was this patch tested? Tested using jykyll build --serve Closes #27290 from dilipbiswal/sql-ref-select-limit. Authored-by: Dilip Biswal Signed-off-by: Sean Owen --- docs/sql-ref-syntax-qry-select-limit.md | 72 +++++++++++++++++++++++-- 1 file changed, 69 insertions(+), 3 deletions(-) diff --git a/docs/sql-ref-syntax-qry-select-limit.md b/docs/sql-ref-syntax-qry-select-limit.md index d7fac3bb98929..609bfb98a097a 100644 --- a/docs/sql-ref-syntax-qry-select-limit.md +++ b/docs/sql-ref-syntax-qry-select-limit.md @@ -1,7 +1,7 @@ --- layout: global -title: LIMIT operator -displayTitle: LIMIT operator +title: LIMIT Clause +displayTitle: LIMIT Clause license: | Licensed to the Apache Software Foundation (ASF) under one or more contributor license agreements. 
See the NOTICE file distributed with @@ -18,5 +18,71 @@ license: | See the License for the specific language governing permissions and limitations under the License. --- +The LIMIT clause is used to constrain the number of rows returned by the SELECT statement. +In general, this clause is used in conjuction with ORDER BY to ensure that the results are deterministic. -**This page is under construction** +### Syntax +{% highlight sql %} +LIMIT { ALL | integer_expression } +{% endhighlight %} + +### Parameters +
+<dl>
+  <dt><code><em>ALL</em></code></dt>
+  <dd>
+    If specified, the query returns all the rows. In other words, no limit is applied if this
+    option is specified.
+  </dd>
+  <dt><code><em>integer_expression</em></code></dt>
+  <dd>
+    Specifies an expression that returns an integer.
+  </dd>
+</dl>
+ +### Examples +{% highlight sql %} +CREATE TABLE person (name STRING, age INT); +INSERT INTO person VALUES ('Zen Hui', 25), + ('Anil B', 18), + ('Shone S', 16), + ('Mike A', 25), + ('John A', 18), + ('Jack N', 16); + +-- Select the first two rows. +SELECT name, age FROM person ORDER BY name LIMIT 2; + + +------+---+ + |name |age| + +------+---+ + |Anil B|18 | + |Jack N|16 | + +------+---+ + +-- Specifying ALL option on LIMIT returns all the rows. +SELECT name, age FROM person ORDER BY name LIMIT ALL; + + +-------+---+ + |name |age| + +-------+---+ + |Anil B |18 | + |Jack N |16 | + |John A |18 | + |Mike A |25 | + |Shone S|16 | + |Zen Hui|25 | + +-------+---+ + +-- A function expression as an input to limit. +SELECT name, age FROM person ORDER BY name LIMIT length('SPARK') + + +-------+---+ + | name|age| + +-------+---+ + | Anil B| 18| + | Jack N| 16| + | John A| 18| + | Mike A| 25| + |Shone S| 16| + +-------+---+ +{% endhighlight %} From 8e280cebf25e47bf40df224461a76fc4c84cc997 Mon Sep 17 00:00:00 2001 From: Kent Yao Date: Thu, 23 Jan 2020 00:41:46 +0800 Subject: [PATCH 06/17] [SPARK-30592][SQL] Interval support for csv and json funtions ### What changes were proposed in this pull request? In this PR, I'd propose to fully support interval for the CSV and JSON functions. On one hand, CSV and JSON records consist of string values, in the cast logic, we can cast string from/to interval now, so we can make those functions support intervals easily. Before this change we can only use this as a workaround. ```sql SELECT cast(from_csv('1, 1 day', 'a INT, b string').b as interval) struct 1 days ``` On the other hand, we ban reading or writing intervals from CSV and JSON files. To directly read and write with external json/csv storage, you still need explicit cast, e.g. ```scala spark.read.schema("a string").json("a.json").selectExpr("cast(a as interval)").show +------+ | a| +------+ |1 days| +------+ ``` ### Why are the changes needed? for interval's future-proofing purpose ### Does this PR introduce any user-facing change? yes, the `to_json`/`from_json` function can deal with intervals now. e.g. for `from_json` there is no such use case because we do not support `a interval` for `to_json`, we can use interval values now #### before ```sql SELECT to_json(map('a', interval 25 month 100 day 130 minute)); Error in query: cannot resolve 'to_json(map('a', INTERVAL '2 years 1 months 100 days 2 hours 10 minutes'))' due to data type mismatch: Unable to convert column a of type interval to JSON.; line 1 pos 7; 'Project [unresolvedalias(to_json(map(a, 2 years 1 months 100 days 2 hours 10 minutes), Some(Asia/Shanghai)), None)] +- OneRowRelation ``` #### after ```sql SELECT to_json(map('a', interval 25 month 100 day 130 minute)) {"a":"2 years 1 months 100 days 2 hours 10 minutes"} ``` ### How was this patch tested? add ut Closes #27317 from yaooqinn/SPARK-30592. 
Authored-by: Kent Yao Signed-off-by: Wenchen Fan --- .../main/scala/org/apache/spark/sql/Row.scala | 8 +++-- .../sql/catalyst/csv/UnivocityParser.scala | 5 +++ .../expressions/jsonExpressions.scala | 4 +-- .../sql/catalyst/json/JacksonGenerator.scala | 4 +++ .../sql/catalyst/json/JacksonParser.scala | 8 ++++- .../sql/catalyst/json/JacksonUtils.scala | 3 +- .../sql/types/CalendarIntervalType.scala | 2 +- .../expressions/CsvExpressionsSuite.scala | 12 ++++++- .../expressions/JsonExpressionsSuite.scala | 31 ++++++----------- .../resources/sql-tests/inputs/interval.sql | 6 ++++ .../sql-tests/results/ansi/interval.sql.out | 34 ++++++++++++++++++- .../sql-tests/results/interval.sql.out | 34 ++++++++++++++++++- .../apache/spark/sql/JsonFunctionsSuite.scala | 33 +++++++----------- 13 files changed, 130 insertions(+), 54 deletions(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/Row.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/Row.scala index 3f9d07520e05f..9a7e077b658df 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/Row.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/Row.scala @@ -19,7 +19,7 @@ package org.apache.spark.sql import java.sql.{Date, Timestamp} import java.time.{Instant, LocalDate} -import java.util.{Base64, TimeZone} +import java.util.Base64 import scala.collection.JavaConverters._ import scala.collection.mutable @@ -29,12 +29,13 @@ import org.json4s._ import org.json4s.JsonAST.JValue import org.json4s.jackson.JsonMethods._ -import org.apache.spark.annotation.{Private, Stable, Unstable} +import org.apache.spark.annotation.{Stable, Unstable} import org.apache.spark.sql.catalyst.CatalystTypeConverters import org.apache.spark.sql.catalyst.expressions.GenericRow import org.apache.spark.sql.catalyst.util.{DateFormatter, DateTimeUtils, TimestampFormatter} import org.apache.spark.sql.internal.SQLConf -import org.apache.spark.sql.types.{ArrayType, BinaryType, DataType, Decimal, MapType, StringType, StructType, UserDefinedType} +import org.apache.spark.sql.types._ +import org.apache.spark.unsafe.types.CalendarInterval /** * @since 1.3.0 @@ -572,6 +573,7 @@ trait Row extends Serializable { JString(timestampFormatter.format(DateTimeUtils.instantToMicros(i))) case (t: Timestamp, _) => JString(timestampFormatter.format(DateTimeUtils.fromJavaTimestamp(t))) + case (i: CalendarInterval, _) => JString(i.toString) case (a: Array[_], ArrayType(elementType, _)) => iteratorToJsonArray(a.iterator, elementType) case (s: Seq[_], ArrayType(elementType, _)) => diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/csv/UnivocityParser.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/csv/UnivocityParser.scala index 61b1b437da04c..5510953804025 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/csv/UnivocityParser.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/csv/UnivocityParser.scala @@ -180,6 +180,11 @@ class UnivocityParser( case _: StringType => (d: String) => nullSafeDatum(d, name, nullable, options)(UTF8String.fromString) + case CalendarIntervalType => (d: String) => + nullSafeDatum(d, name, nullable, options) { datum => + IntervalUtils.safeStringToInterval(UTF8String.fromString(datum)) + } + case udt: UserDefinedType[_] => makeConverter(name, udt.sqlType, nullable) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/jsonExpressions.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/jsonExpressions.scala index 
43b01eb0d4e1c..61afdb6c9492f 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/jsonExpressions.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/jsonExpressions.scala @@ -694,10 +694,8 @@ case class StructsToJson( TypeCheckResult.TypeCheckFailure(e.getMessage) } case map: MapType => - // TODO: let `JacksonUtils.verifySchema` verify a `MapType` try { - val st = StructType(StructField("a", map) :: Nil) - JacksonUtils.verifySchema(st) + JacksonUtils.verifyType(prettyName, map) TypeCheckResult.TypeCheckSuccess } catch { case e: UnsupportedOperationException => diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/json/JacksonGenerator.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/json/JacksonGenerator.scala index aaf2ecf7923ce..9c63593ea1752 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/json/JacksonGenerator.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/json/JacksonGenerator.scala @@ -133,6 +133,10 @@ private[sql] class JacksonGenerator( val dateString = dateFormatter.format(row.getInt(ordinal)) gen.writeString(dateString) + case CalendarIntervalType => + (row: SpecializedGetters, ordinal: Int) => + gen.writeString(row.getInterval(ordinal).toString) + case BinaryType => (row: SpecializedGetters, ordinal: Int) => gen.writeBinary(row.getBinary(ordinal)) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/json/JacksonParser.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/json/JacksonParser.scala index 4824b0c860cb4..c44025ca8bcfd 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/json/JacksonParser.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/json/JacksonParser.scala @@ -31,7 +31,7 @@ import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions._ import org.apache.spark.sql.catalyst.util._ import org.apache.spark.sql.types._ -import org.apache.spark.unsafe.types.UTF8String +import org.apache.spark.unsafe.types.{CalendarInterval, UTF8String} import org.apache.spark.util.Utils /** @@ -252,6 +252,12 @@ class JacksonParser( Decimal(bigDecimal, dt.precision, dt.scale) } + case CalendarIntervalType => (parser: JsonParser) => + parseJsonToken[CalendarInterval](parser, dataType) { + case VALUE_STRING => + IntervalUtils.safeStringToInterval(UTF8String.fromString(parser.getText)) + } + case st: StructType => val fieldConverters = st.map(_.dataType).map(makeConverter).toArray (parser: JsonParser) => parseJsonToken[InternalRow](parser, dataType) { diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/json/JacksonUtils.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/json/JacksonUtils.scala index 2d89c7066d080..386c7e3f7541c 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/json/JacksonUtils.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/json/JacksonUtils.scala @@ -34,8 +34,7 @@ object JacksonUtils { def verifyType(name: String, dataType: DataType): Unit = { dataType match { - case NullType | BooleanType | ByteType | ShortType | IntegerType | LongType | FloatType | - DoubleType | StringType | TimestampType | DateType | BinaryType | _: DecimalType => + case NullType | _: AtomicType | CalendarIntervalType => case st: StructType => st.foreach(field => verifyType(field.name, field.dataType)) diff --git 
a/sql/catalyst/src/main/scala/org/apache/spark/sql/types/CalendarIntervalType.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/types/CalendarIntervalType.scala index 5889f1ce4e1df..35ad864db0e7d 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/types/CalendarIntervalType.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/types/CalendarIntervalType.scala @@ -37,7 +37,7 @@ class CalendarIntervalType private() extends DataType { override def defaultSize: Int = 16 - override def simpleString: String = "interval" + override def typeName: String = "interval" private[spark] override def asNullable: CalendarIntervalType = this } diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/CsvExpressionsSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/CsvExpressionsSuite.scala index 98c93a4946f4f..e623910e2efe1 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/CsvExpressionsSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/CsvExpressionsSuite.scala @@ -28,7 +28,7 @@ import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.plans.PlanTestBase import org.apache.spark.sql.catalyst.util._ import org.apache.spark.sql.types._ -import org.apache.spark.unsafe.types.UTF8String +import org.apache.spark.unsafe.types.{CalendarInterval, UTF8String} class CsvExpressionsSuite extends SparkFunSuite with ExpressionEvalHelper with PlanTestBase { val badCsv = "\u0000\u0000\u0000A\u0001AAA" @@ -237,4 +237,14 @@ class CsvExpressionsSuite extends SparkFunSuite with ExpressionEvalHelper with P timeZoneId = gmtId), expectedErrMsg = "The field for corrupt records must be string type and nullable") } + + test("from/to csv with intervals") { + val schema = new StructType().add("a", "interval") + checkEvaluation( + StructsToCsv(Map.empty, Literal.create(create_row(new CalendarInterval(1, 2, 3)), schema)), + "1 months 2 days 0.000003 seconds") + checkEvaluation( + CsvToStructs(schema, Map.empty, Literal.create("1 day")), + InternalRow(new CalendarInterval(0, 1, 0))) + } } diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/JsonExpressionsSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/JsonExpressionsSuite.scala index d5cc1d4f0fdde..3693531f47610 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/JsonExpressionsSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/JsonExpressionsSuite.scala @@ -25,12 +25,10 @@ import org.scalatest.exceptions.TestFailedException import org.apache.spark.{SparkException, SparkFunSuite} import org.apache.spark.sql.AnalysisException import org.apache.spark.sql.catalyst.InternalRow -import org.apache.spark.sql.catalyst.analysis.TypeCheckResult import org.apache.spark.sql.catalyst.plans.PlanTestBase import org.apache.spark.sql.catalyst.util._ -import org.apache.spark.sql.internal.SQLConf import org.apache.spark.sql.types._ -import org.apache.spark.unsafe.types.UTF8String +import org.apache.spark.unsafe.types.{CalendarInterval, UTF8String} class JsonExpressionsSuite extends SparkFunSuite with ExpressionEvalHelper with PlanTestBase { val json = @@ -680,25 +678,18 @@ class JsonExpressionsSuite extends SparkFunSuite with ExpressionEvalHelper with output) } - test("to_json: verify MapType's value type instead of key type") { - // Keys in map are treated as strings when converting to JSON. 
The type doesn't matter at all. - val mapType1 = MapType(CalendarIntervalType, IntegerType) - val schema1 = StructType(StructField("a", mapType1) :: Nil) - val struct1 = Literal.create(null, schema1) + test("from/to json - interval support") { + val schema = StructType(StructField("i", CalendarIntervalType) :: Nil) checkEvaluation( - StructsToJson(Map.empty, struct1, gmtId), - null - ) + JsonToStructs(schema, Map.empty, Literal.create("""{"i":"1 year 1 day"}""", StringType)), + InternalRow(new CalendarInterval(12, 1, 0))) - // The value type must be valid for converting to JSON. - val mapType2 = MapType(IntegerType, CalendarIntervalType) - val schema2 = StructType(StructField("a", mapType2) :: Nil) - val struct2 = Literal.create(null, schema2) - StructsToJson(Map.empty, struct2, gmtId).checkInputDataTypes() match { - case TypeCheckResult.TypeCheckFailure(msg) => - assert(msg.contains("Unable to convert column a of type interval to JSON")) - case _ => fail("from_json should not work on interval map value type.") - } + Seq(MapType(CalendarIntervalType, IntegerType), MapType(IntegerType, CalendarIntervalType)) + .foreach { dt => + val schema = StructField("a", dt) :: Nil + val struct = Literal.create(null, StructType(schema)) + assert(StructsToJson(Map.empty, struct).checkInputDataTypes().isSuccess) + } } test("from_json missing fields") { diff --git a/sql/core/src/test/resources/sql-tests/inputs/interval.sql b/sql/core/src/test/resources/sql-tests/inputs/interval.sql index 5de5656cbec56..fb6c485f619ae 100644 --- a/sql/core/src/test/resources/sql-tests/inputs/interval.sql +++ b/sql/core/src/test/resources/sql-tests/inputs/interval.sql @@ -220,3 +220,9 @@ select a - b from values (interval '-2147483648 months', interval '2147483647 mo select b + interval '1 month' from values (interval '-2147483648 months', interval '2147483647 months') t(a, b); select a * 1.1 from values (interval '-2147483648 months', interval '2147483647 months') t(a, b); select a / 0.5 from values (interval '-2147483648 months', interval '2147483647 months') t(a, b); + +-- interval support for csv and json functions +SELECT from_csv('1, 1 day', 'a INT, b interval'); +SELECT to_csv(named_struct('a', interval 32 month, 'b', interval 70 minute)); +SELECT from_json('{"a":"1 days"}', 'a interval'); +SELECT to_json(map('a', interval 25 month 100 day 130 minute)); diff --git a/sql/core/src/test/resources/sql-tests/results/ansi/interval.sql.out b/sql/core/src/test/resources/sql-tests/results/ansi/interval.sql.out index 2893b6a01d1eb..64107f98ede6c 100644 --- a/sql/core/src/test/resources/sql-tests/results/ansi/interval.sql.out +++ b/sql/core/src/test/resources/sql-tests/results/ansi/interval.sql.out @@ -1,5 +1,5 @@ -- Automatically generated by SQLQueryTestSuite --- Number of queries: 97 +-- Number of queries: 101 -- !query 0 @@ -985,3 +985,35 @@ struct<> -- !query 96 output java.lang.ArithmeticException integer overflow + + +-- !query 97 +SELECT from_csv('1, 1 day', 'a INT, b interval') +-- !query 97 schema +struct> +-- !query 97 output +{"a":1,"b":1 days} + + +-- !query 98 +SELECT to_csv(named_struct('a', interval 32 month, 'b', interval 70 minute)) +-- !query 98 schema +struct +-- !query 98 output +2 years 8 months,1 hours 10 minutes + + +-- !query 99 +SELECT from_json('{"a":"1 days"}', 'a interval') +-- !query 99 schema +struct> +-- !query 99 output +{"a":1 days} + + +-- !query 100 +SELECT to_json(map('a', interval 25 month 100 day 130 minute)) +-- !query 100 schema +struct +-- !query 100 output +{"a":"2 years 1 months 100 
days 2 hours 10 minutes"} diff --git a/sql/core/src/test/resources/sql-tests/results/interval.sql.out b/sql/core/src/test/resources/sql-tests/results/interval.sql.out index 633ee6fccc98f..d494399cedddc 100644 --- a/sql/core/src/test/resources/sql-tests/results/interval.sql.out +++ b/sql/core/src/test/resources/sql-tests/results/interval.sql.out @@ -1,5 +1,5 @@ -- Automatically generated by SQLQueryTestSuite --- Number of queries: 97 +-- Number of queries: 101 -- !query 0 @@ -966,3 +966,35 @@ struct<> -- !query 96 output java.lang.ArithmeticException integer overflow + + +-- !query 97 +SELECT from_csv('1, 1 day', 'a INT, b interval') +-- !query 97 schema +struct> +-- !query 97 output +{"a":1,"b":1 days} + + +-- !query 98 +SELECT to_csv(named_struct('a', interval 32 month, 'b', interval 70 minute)) +-- !query 98 schema +struct +-- !query 98 output +2 years 8 months,1 hours 10 minutes + + +-- !query 99 +SELECT from_json('{"a":"1 days"}', 'a interval') +-- !query 99 schema +struct> +-- !query 99 output +{"a":1 days} + + +-- !query 100 +SELECT to_json(map('a', interval 25 month 100 day 130 minute)) +-- !query 100 schema +struct +-- !query 100 output +{"a":"2 years 1 months 100 days 2 hours 10 minutes"} diff --git a/sql/core/src/test/scala/org/apache/spark/sql/JsonFunctionsSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/JsonFunctionsSuite.scala index 6f55676f318d5..fd1e9e309558e 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/JsonFunctionsSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/JsonFunctionsSuite.scala @@ -222,33 +222,24 @@ class JsonFunctionsSuite extends QueryTest with SharedSparkSession { Row("""{"_1":"26/08/2015 18:00"}""") :: Nil) } - test("to_json - key types of map don't matter") { - // interval type is invalid for converting to JSON. However, the keys of a map are treated - // as strings, so its type doesn't matter. - val df = Seq(Tuple1(Tuple1("-3 month 7 hours"))).toDF("a") - .select(struct(map($"a._1".cast(CalendarIntervalType), lit("a")).as("col1")).as("c")) + test("to_json - interval support") { + val baseDf = Seq(Tuple1(Tuple1("-3 month 7 hours"))).toDF("a") + val df = baseDf.select(struct($"a._1".cast(CalendarIntervalType).as("a")).as("c")) checkAnswer( df.select(to_json($"c")), - Row("""{"col1":{"-3 months 7 hours":"a"}}""") :: Nil) - } + Row("""{"a":"-3 months 7 hours"}""") :: Nil) - test("to_json unsupported type") { - val baseDf = Seq(Tuple1(Tuple1("-3 month 7 hours"))).toDF("a") - val df = baseDf.select(struct($"a._1".cast(CalendarIntervalType).as("a")).as("c")) - val e = intercept[AnalysisException]{ - // Unsupported type throws an exception - df.select(to_json($"c")).collect() - } - assert(e.getMessage.contains( - "Unable to convert column a of type interval to JSON.")) + val df1 = baseDf + .select(struct(map($"a._1".cast(CalendarIntervalType), lit("a")).as("col1")).as("c")) + checkAnswer( + df1.select(to_json($"c")), + Row("""{"col1":{"-3 months 7 hours":"a"}}""") :: Nil) - // interval type is invalid for converting to JSON. We can't use it as value type of a map. 
val df2 = baseDf .select(struct(map(lit("a"), $"a._1".cast(CalendarIntervalType)).as("col1")).as("c")) - val e2 = intercept[AnalysisException] { - df2.select(to_json($"c")).collect() - } - assert(e2.getMessage.contains("Unable to convert column col1 of type interval to JSON")) + checkAnswer( + df2.select(to_json($"c")), + Row("""{"col1":{"a":"-3 months 7 hours"}}""") :: Nil) } test("roundtrip in to_json and from_json - struct") { From 6dfaa0783f7779972752cac48fabbb321811f3c0 Mon Sep 17 00:00:00 2001 From: jiake Date: Wed, 22 Jan 2020 09:02:34 -0800 Subject: [PATCH 07/17] [SPARK-30549][SQL] Fix the subquery shown issue in UI When enable AQE ### What changes were proposed in this pull request? After [PR#25316](https://github.com/apache/spark/pull/25316) fixed the dead lock issue in [PR#25308](https://github.com/apache/spark/pull/25308), the subquery metrics can not be shown in UI as following screenshot. ![image](https://user-images.githubusercontent.com/11972570/72891385-160ec980-3d4f-11ea-91fc-ccaad890f7dc.png) This PR fix the subquery UI shown issue by adding `SparkListenerSQLAdaptiveSQLMetricUpdates` event to update the suquery sql metric. After with this PR, the suquery UI can show correctly as following screenshot: ![image](https://user-images.githubusercontent.com/11972570/72893610-66d4f100-3d54-11ea-93c9-f444b2f31952.png) ### Why are the changes needed? Showing the subquery metric in UI when enable AQE ### Does this PR introduce any user-facing change? No ### How was this patch tested? Existing UT Closes #27260 from JkSelf/fixSubqueryUI. Authored-by: jiake Signed-off-by: Xiao Li --- .../adaptive/AdaptiveSparkPlanExec.scala | 51 +++++++++++++++---- .../execution/ui/SQLAppStatusListener.scala | 9 ++++ .../spark/sql/execution/ui/SQLListener.scala | 6 +++ 3 files changed, 55 insertions(+), 11 deletions(-) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/AdaptiveSparkPlanExec.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/AdaptiveSparkPlanExec.scala index 56ea12918448c..3f20b59361988 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/AdaptiveSparkPlanExec.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/AdaptiveSparkPlanExec.scala @@ -37,7 +37,8 @@ import org.apache.spark.sql.catalyst.trees.TreeNodeTag import org.apache.spark.sql.execution._ import org.apache.spark.sql.execution.adaptive.AdaptiveSparkPlanExec._ import org.apache.spark.sql.execution.exchange._ -import org.apache.spark.sql.execution.ui.SparkListenerSQLAdaptiveExecutionUpdate +import org.apache.spark.sql.execution.metric.SQLMetric +import org.apache.spark.sql.execution.ui.{SparkListenerSQLAdaptiveExecutionUpdate, SparkListenerSQLAdaptiveSQLMetricUpdates, SQLPlanMetric} import org.apache.spark.sql.internal.SQLConf import org.apache.spark.util.ThreadUtils @@ -132,16 +133,27 @@ case class AdaptiveSparkPlanExec( executedPlan.resetMetrics() } + private def collectSQLMetrics(plan: SparkPlan): Seq[SQLMetric] = { + val metrics = new mutable.ArrayBuffer[SQLMetric]() + plan.foreach { + case p: ShuffleQueryStageExec if (p.resultOption.isEmpty) => + collectSQLMetrics(p.plan).foreach(metrics += _) + case p: BroadcastQueryStageExec if (p.resultOption.isEmpty) => + collectSQLMetrics(p.plan).foreach(metrics += _) + case p: SparkPlan => + p.metrics.foreach { case metric => + metrics += metric._2 + } + } + metrics + } + private def getFinalPhysicalPlan(): SparkPlan = lock.synchronized { if (!isFinalPlan) { // Subqueries do not have their 
own execution IDs and therefore rely on the main query to // update UI. - val executionId = if (isSubquery) { - None - } else { - Option(context.session.sparkContext.getLocalProperty(SQLExecution.EXECUTION_ID_KEY)) - .map(_.toLong) - } + val executionId = Option(context.session.sparkContext.getLocalProperty( + SQLExecution.EXECUTION_ID_KEY)).map(_.toLong) var currentLogicalPlan = currentPhysicalPlan.logicalLink.get var result = createQueryStages(currentPhysicalPlan) val events = new LinkedBlockingQueue[StageMaterializationEvent]() @@ -484,10 +496,27 @@ case class AdaptiveSparkPlanExec( * Notify the listeners of the physical plan change. */ private def onUpdatePlan(executionId: Long): Unit = { - context.session.sparkContext.listenerBus.post(SparkListenerSQLAdaptiveExecutionUpdate( - executionId, - SQLExecution.getQueryExecution(executionId).toString, - SparkPlanInfo.fromSparkPlan(this))) + if (isSubquery) { + // When executing subqueries, we can't update the query plan in the UI as the + // UI doesn't support partial update yet. However, the subquery may have been + // optimized into a different plan and we must let the UI know the SQL metrics + // of the new plan nodes, so that it can track the valid accumulator updates later + // and display SQL metrics correctly. + onUpdateSQLMetrics(collectSQLMetrics(currentPhysicalPlan), executionId) + } else { + context.session.sparkContext.listenerBus.post(SparkListenerSQLAdaptiveExecutionUpdate( + executionId, + SQLExecution.getQueryExecution(executionId).toString, + SparkPlanInfo.fromSparkPlan(this))) + } + } + + private def onUpdateSQLMetrics(sqlMetrics: Seq[SQLMetric], executionId: Long): Unit = { + val sqlPlanMetrics = sqlMetrics.map { case sqlMetric => + SQLPlanMetric(sqlMetric.name.get, sqlMetric.id, sqlMetric.metricType) + } + context.session.sparkContext.listenerBus.post(SparkListenerSQLAdaptiveSQLMetricUpdates( + executionId.toLong, sqlPlanMetrics)) } /** diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/ui/SQLAppStatusListener.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/ui/SQLAppStatusListener.scala index d5bb36e8a0c95..1454cc05ed4da 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/ui/SQLAppStatusListener.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/ui/SQLAppStatusListener.scala @@ -344,6 +344,14 @@ class SQLAppStatusListener( update(exec) } + private def onAdaptiveSQLMetricUpdate(event: SparkListenerSQLAdaptiveSQLMetricUpdates): Unit = { + val SparkListenerSQLAdaptiveSQLMetricUpdates(executionId, sqlPlanMetrics) = event + + val exec = getOrCreateExecution(executionId) + exec.metrics = exec.metrics ++ sqlPlanMetrics + update(exec) + } + private def onExecutionEnd(event: SparkListenerSQLExecutionEnd): Unit = { val SparkListenerSQLExecutionEnd(executionId, time) = event Option(liveExecutions.get(executionId)).foreach { exec => @@ -383,6 +391,7 @@ class SQLAppStatusListener( override def onOtherEvent(event: SparkListenerEvent): Unit = event match { case e: SparkListenerSQLExecutionStart => onExecutionStart(e) case e: SparkListenerSQLAdaptiveExecutionUpdate => onAdaptiveExecutionUpdate(e) + case e: SparkListenerSQLAdaptiveSQLMetricUpdates => onAdaptiveSQLMetricUpdate(e) case e: SparkListenerSQLExecutionEnd => onExecutionEnd(e) case e: SparkListenerDriverAccumUpdates => onDriverAccumUpdates(e) case _ => // Ignore diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/ui/SQLListener.scala 
b/sql/core/src/main/scala/org/apache/spark/sql/execution/ui/SQLListener.scala index 81cbc7f54c7eb..6a6a71c46f213 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/ui/SQLListener.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/ui/SQLListener.scala @@ -34,6 +34,12 @@ case class SparkListenerSQLAdaptiveExecutionUpdate( sparkPlanInfo: SparkPlanInfo) extends SparkListenerEvent +@DeveloperApi +case class SparkListenerSQLAdaptiveSQLMetricUpdates( + executionId: Long, + sqlPlanMetrics: Seq[SQLPlanMetric]) + extends SparkListenerEvent + @DeveloperApi case class SparkListenerSQLExecutionStart( executionId: Long, From 84f11548e428abc28617218e4405a159d44c0eac Mon Sep 17 00:00:00 2001 From: Udbhav30 Date: Wed, 22 Jan 2020 14:20:28 -0800 Subject: [PATCH 08/17] [SPARK-30604][CORE] Fix a log message by including hostLocalBlockBytes to total bytes ### What changes were proposed in this pull request? Add HostLocalBlock size in log total bytes ### Why are the changes needed? total size in log is wrong as hostlocal block size is missed ### Does this PR introduce any user-facing change? no ### How was this patch tested? Manually checking the log Closes #27320 from Udbhav30/bug. Authored-by: Udbhav30 Signed-off-by: Dongjoon Hyun --- .../org/apache/spark/storage/ShuffleBlockFetcherIterator.scala | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/core/src/main/scala/org/apache/spark/storage/ShuffleBlockFetcherIterator.scala b/core/src/main/scala/org/apache/spark/storage/ShuffleBlockFetcherIterator.scala index 97e0600da4a99..cd4c86006af5a 100644 --- a/core/src/main/scala/org/apache/spark/storage/ShuffleBlockFetcherIterator.scala +++ b/core/src/main/scala/org/apache/spark/storage/ShuffleBlockFetcherIterator.scala @@ -317,7 +317,7 @@ final class ShuffleBlockFetcherIterator( collectFetchRequests(address, blockInfos, collectedRemoteRequests) } } - val totalBytes = localBlockBytes + remoteBlockBytes + val totalBytes = localBlockBytes + remoteBlockBytes + hostLocalBlockBytes logInfo(s"Getting $numBlocksToFetch (${Utils.bytesToString(totalBytes)}) non-empty blocks " + s"including ${localBlocks.size} (${Utils.bytesToString(localBlockBytes)}) local and " + s"${hostLocalBlocks.size} (${Utils.bytesToString(hostLocalBlockBytes)}) " + From 4ca31b470f47f5cefd603778852e828420a89456 Mon Sep 17 00:00:00 2001 From: Maxim Gekk Date: Wed, 22 Jan 2020 15:40:24 -0800 Subject: [PATCH 09/17] [SPARK-30606][SQL] Fix the `like` function with 2 parameters ### What changes were proposed in this pull request? In the PR, I propose to add additional constructor in the `Like` expression. The constructor can be used on applying the `like` function with 2 parameters. ### Why are the changes needed? `FunctionRegistry` cannot find a constructor if the `like` function is applied to 2 parameters. ### Does this PR introduce any user-facing change? 
Yes, before: ```sql spark-sql> SELECT like('Spark', '_park'); Invalid arguments for function like; line 1 pos 7 org.apache.spark.sql.AnalysisException: Invalid arguments for function like; line 1 pos 7 at org.apache.spark.sql.catalyst.analysis.FunctionRegistry$.$anonfun$expression$7(FunctionRegistry.scala:618) at scala.Option.getOrElse(Option.scala:189) at org.apache.spark.sql.catalyst.analysis.FunctionRegistry$.$anonfun$expression$4(FunctionRegistry.scala:602) at org.apache.spark.sql.catalyst.analysis.SimpleFunctionRegistry.lookupFunction(FunctionRegistry.scala:121) at org.apache.spark.sql.catalyst.catalog.SessionCatalog.lookupFunction(SessionCatalog.scala:1412) ``` After: ```sql spark-sql> SELECT like('Spark', '_park'); true ``` ### How was this patch tested? By running `check outputs of expression examples` from `SQLQuerySuite`. Closes #27323 from MaxGekk/fix-like-func. Authored-by: Maxim Gekk Signed-off-by: Dongjoon Hyun --- .../spark/sql/catalyst/expressions/regexpExpressions.scala | 6 +++++- .../spark/sql/catalyst/optimizer/ConstantFoldingSuite.scala | 4 ++-- sql/core/src/main/scala/org/apache/spark/sql/Column.scala | 2 +- 3 files changed, 8 insertions(+), 4 deletions(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/regexpExpressions.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/regexpExpressions.scala index c6fb6c95e5d31..2354087768615 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/regexpExpressions.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/regexpExpressions.scala @@ -95,6 +95,8 @@ abstract class StringRegexExpression extends BinaryExpression """, examples = """ Examples: + > SELECT _FUNC_('Spark', '_park'); + true > SET spark.sql.parser.escapedStringLiterals=true; spark.sql.parser.escapedStringLiterals true > SELECT '%SystemDrive%\Users\John' _FUNC_ '\%SystemDrive\%\\Users%'; @@ -111,9 +113,11 @@ abstract class StringRegexExpression extends BinaryExpression """, since = "1.0.0") // scalastyle:on line.contains.tab -case class Like(left: Expression, right: Expression, escapeChar: Char = '\\') +case class Like(left: Expression, right: Expression, escapeChar: Char) extends StringRegexExpression { + def this(left: Expression, right: Expression) = this(left, right, '\\') + override def escape(v: String): String = StringUtils.escapeLikeRegex(v, escapeChar) override def matches(regex: Pattern, str: String): Boolean = regex.matcher(str).matches() diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/ConstantFoldingSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/ConstantFoldingSuite.scala index 641c89873dcc4..23ab6b2df3e64 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/ConstantFoldingSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/ConstantFoldingSuite.scala @@ -196,8 +196,8 @@ class ConstantFoldingSuite extends PlanTest { EqualTo(Literal.create(null, IntegerType), 1) as 'c11, EqualTo(1, Literal.create(null, IntegerType)) as 'c12, - Like(Literal.create(null, StringType), "abc") as 'c13, - Like("abc", Literal.create(null, StringType)) as 'c14, + new Like(Literal.create(null, StringType), "abc") as 'c13, + new Like("abc", Literal.create(null, StringType)) as 'c14, Upper(Literal.create(null, StringType)) as 'c15, diff --git a/sql/core/src/main/scala/org/apache/spark/sql/Column.scala 
b/sql/core/src/main/scala/org/apache/spark/sql/Column.scala index ed10843b08596..8bd5835fd931b 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/Column.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/Column.scala @@ -841,7 +841,7 @@ class Column(val expr: Expression) extends Logging { * @group expr_ops * @since 1.3.0 */ - def like(literal: String): Column = withExpr { Like(expr, lit(literal).expr) } + def like(literal: String): Column = withExpr { new Like(expr, lit(literal).expr) } /** * SQL RLIKE expression (LIKE with Regex). Returns a boolean column based on a regex From 2e74dba3d065d40a4e487c9135ec3c4c99d3d50a Mon Sep 17 00:00:00 2001 From: Dilip Biswal Date: Wed, 22 Jan 2020 18:30:42 -0600 Subject: [PATCH 10/17] [SPARK-30574][DOC] Document GROUP BY Clause of SELECT statement in SQL Reference ### What changes were proposed in this pull request? Document GROUP BY clause of SELECT statement in SQL Reference Guide. ### Why are the changes needed? Currently Spark lacks documentation on the supported SQL constructs causing confusion among users who sometimes have to look at the code to understand the usage. This is aimed at addressing this issue. ### Does this PR introduce any user-facing change? Yes. **Before:** There was no documentation for this. **After.** Screen Shot 2020-01-19 at 5 11 12 PM Screen Shot 2020-01-19 at 5 11 32 PM Screen Shot 2020-01-19 at 5 11 49 PM Screen Shot 2020-01-19 at 5 12 05 PM Screen Shot 2020-01-19 at 5 12 31 PM Screen Shot 2020-01-19 at 5 26 38 PM ### How was this patch tested? Tested using jykyll build --serve Closes #27283 from dilipbiswal/sql-ref-select-groupby. Authored-by: Dilip Biswal Signed-off-by: Sean Owen --- docs/sql-ref-syntax-qry-select-groupby.md | 209 +++++++++++++++++++++- 1 file changed, 206 insertions(+), 3 deletions(-) diff --git a/docs/sql-ref-syntax-qry-select-groupby.md b/docs/sql-ref-syntax-qry-select-groupby.md index 8ba7d155f3716..2d13c349561a8 100644 --- a/docs/sql-ref-syntax-qry-select-groupby.md +++ b/docs/sql-ref-syntax-qry-select-groupby.md @@ -1,7 +1,7 @@ --- layout: global -title: GROUPBY Clause -displayTitle: GROUPBY Clause +title: GROUP BY Clause +displayTitle: GROUP BY Clause license: | Licensed to the Apache Software Foundation (ASF) under one or more contributor license agreements. See the NOTICE file distributed with @@ -18,5 +18,208 @@ license: | See the License for the specific language governing permissions and limitations under the License. --- +The GROUP BY clause is used to group the rows based on a set of specified grouping expressions and compute aggregations on +the group of rows based on one or more specified aggregate functions. Spark also supports advanced aggregations to do multiple +aggregations for the same input record set via `GROUPING SETS`, `CUBE`, `ROLLUP` clauses. -**This page is under construction** +### Syntax +{% highlight sql %} +GROUP BY [ GROUPING SETS grouping_sets ] group_expression [ , group_expression [ , ... ] ] + [ ( WITH ROLLUP | WITH CUBE | GROUPING SETS grouping_sets ) ) ] +{% endhighlight %} + +### Parameters +
+
GROUPING SETS
+
+    Groups the rows for each subset of the expressions specified in the grouping sets. For example,
+    GROUP BY GROUPING SETS (warehouse, product) is semantically equivalent
+    to the union of the results of GROUP BY warehouse and GROUP BY product. This clause
+    is shorthand for a UNION ALL where each leg of the UNION ALL
+    operator performs aggregation of a subset of the columns specified in the GROUPING SETS clause.
+
grouping_sets
+
+    Specifies one or more groupings based on which the GROUP BY clause performs aggregations. A grouping
+    set is specified by a list of comma-separated expressions in parentheses.

+ Syntax: + + (() | (expression [ , ...])) + +
+
grouping_expression
+
+    Specifies the criteria based on which the rows are grouped together. The grouping of rows is performed based on
+    the result values of the grouping expressions. A grouping expression may be a column alias, a column position
+    or an expression.
+
ROLLUP
+
+    Specifies multiple levels of aggregations in a single statement. This clause is used to compute aggregations
+    based on multiple grouping sets. ROLLUP is shorthand for GROUPING SETS. For example,
+    GROUP BY warehouse, product WITH ROLLUP is equivalent to GROUP BY warehouse, product GROUPING SETS
+    ((warehouse, product), (warehouse), ()).
+    The N elements of a ROLLUP specification result in N+1 GROUPING SETS.
+
CUBE
+
+    The CUBE clause is used to perform aggregations based on combinations of the grouping columns specified in the
+    GROUP BY clause. For example, GROUP BY warehouse, product WITH CUBE is equivalent
+    to GROUP BY warehouse, product GROUPING SETS ((warehouse, product), (warehouse), (product), ()).
+    The N elements of a CUBE specification result in 2^N GROUPING SETS.
+
+ +### Examples +{% highlight sql %} +CREATE TABLE dealer (id INT, city STRING, car_model STRING, quantity INT); +INSERT INTO dealer VALUES (100, 'Fremont', 'Honda Civic', 10), + (100, 'Fremont', 'Honda Accord', 15), + (100, 'Fremont', 'Honda CRV', 7), + (200, 'Dublin', 'Honda Civic', 20), + (200, 'Dublin', 'Honda Accord', 10), + (200, 'Dublin', 'Honda CRV', 3), + (300, 'San Jose', 'Honda Civic', 5), + (300, 'San Jose', 'Honda Accord', 8); + +-- Sum of quantity per dealership. Group by `id`. +SELECT id, sum(quantity) FROM dealer GROUP BY id ORDER BY id; + + +---+-------------+ + |id |sum(quantity)| + +---+-------------+ + |100|32 | + |200|33 | + |300|13 | + +---+-------------+ + +-- Use column position in GROUP by clause. +SELECT id, sum(quantity) FROM dealer GROUP BY 1 ORDER BY 1; + + +---+-------------+ + |id |sum(quantity)| + +---+-------------+ + |100|32 | + |200|33 | + |300|13 | + +---+-------------+ + +-- Multiple aggregations. +-- 1. Sum of quantity per dealership. +-- 2. Max quantity per dealership. +SELECT id, sum(quantity) AS sum, max(quantity) AS max FROM dealer GROUP BY id ORDER BY id; + + +---+---+---+ + |id |sum|max| + +---+---+---+ + |100|32 |15 | + |200|33 |20 | + |300|13 |8 | + +---+---+---+ + +-- Aggregations using multiple sets of grouping columns in a single statement. +-- Following performs aggregations based on four sets of grouping columns. +-- 1. city, car_model +-- 2. city +-- 3. car_model +-- 4. Empty grouping set. Returns quantities for all city and car models. +SELECT city, car_model, sum(quantity) AS sum FROM dealer + GROUP BY GROUPING SETS ((city, car_model), (city), (car_model), ()) + ORDER BY city; + + +--------+------------+---+ + |city |car_model |sum| + +--------+------------+---+ + |null |null |78 | + |null |Honda Accord|33 | + |null |Honda CRV |10 | + |null |Honda Civic |35 | + |Dublin |null |33 | + |Dublin |Honda Accord|10 | + |Dublin |Honda CRV |3 | + |Dublin |Honda Civic |20 | + |Fremont |null |32 | + |Fremont |Honda Accord|15 | + |Fremont |Honda CRV |7 | + |Fremont |Honda Civic |10 | + |San Jose|null |13 | + |San Jose|Honda Accord|8 | + |San Jose|Honda Civic |5 | + +--------+------------+---+ + +-- Alternate syntax for `GROUPING SETS` in which both `GROUP BY` and `GROUPING SETS` +-- specifications are present. +SELECT city, car_model, sum(quantity) AS sum FROM dealer + GROUP BY city, car_model GROUPING SETS ((city, car_model), (city), (car_model), ()) + ORDER BY city, car_model; + + +--------+------------+---+ + |city |car_model |sum| + +--------+------------+---+ + |null |null |78 | + |null |Honda Accord|33 | + |null |Honda CRV |10 | + |null |Honda Civic |35 | + |Dublin |null |33 | + |Dublin |Honda Accord|10 | + |Dublin |Honda CRV |3 | + |Dublin |Honda Civic |20 | + |Fremont |null |32 | + |Fremont |Honda Accord|15 | + |Fremont |Honda CRV |7 | + |Fremont |Honda Civic |10 | + |San Jose|null |13 | + |San Jose|Honda Accord|8 | + |San Jose|Honda Civic |5 | + +--------+------------+---+ + +-- Group by processing with `ROLLUP` clause. 
+-- Equivalent GROUP BY GROUPING SETS ((city, car_model), (city), ()) +SELECT city, car_model, sum(quantity) AS sum FROM dealer + GROUP BY city, car_model WITH ROLLUP + ORDER BY city, car_model; + + +--------+------------+---+ + |city |car_model |sum| + +--------+------------+---+ + |null |null |78 | + |Dublin |null |33 | + |Dublin |Honda Accord|10 | + |Dublin |Honda CRV |3 | + |Dublin |Honda Civic |20 | + |Fremont |null |32 | + |Fremont |Honda Accord|15 | + |Fremont |Honda CRV |7 | + |Fremont |Honda Civic |10 | + |San Jose|null |13 | + |San Jose|Honda Accord|8 | + |San Jose|Honda Civic |5 | + +--------+------------+---+ + +-- Group by processing with `CUBE` clause. +-- Equivalent GROUP BY GROUPING SETS ((city, car_model), (city), (car_model), ()) +SELECT city, car_model, sum(quantity) AS sum FROM dealer + GROUP BY city, car_model WITH CUBE + ORDER BY city, car_model; + + +--------+------------+---+ + |city |car_model |sum| + +--------+------------+---+ + |null |null |78 | + |null |Honda Accord|33 | + |null |Honda CRV |10 | + |null |Honda Civic |35 | + |Dublin |null |33 | + |Dublin |Honda Accord|10 | + |Dublin |Honda CRV |3 | + |Dublin |Honda Civic |20 | + |Fremont |null |32 | + |Fremont |Honda Accord|15 | + |Fremont |Honda CRV |7 | + |Fremont |Honda Civic |10 | + |San Jose|null |13 | + |San Jose|Honda Accord|8 | + |San Jose|Honda Civic |5 | + +--------+------------+---+ + +{% endhighlight %} From 38f4e599b385a54bd5a8866585927ded7caf3939 Mon Sep 17 00:00:00 2001 From: Dilip Biswal Date: Wed, 22 Jan 2020 18:46:28 -0600 Subject: [PATCH 11/17] [SPARK-28801][DOC] Document SELECT statement in SQL Reference (Main page) ### What changes were proposed in this pull request? Document SELECT statement in SQL Reference Guide. In this PR includes the main entry page for SELECT. I will open follow-up PRs for different clauses. ### Why are the changes needed? Currently Spark lacks documentation on the supported SQL constructs causing confusion among users who sometimes have to look at the code to understand the usage. This is aimed at addressing this issue. ### Does this PR introduce any user-facing change? Yes. **Before:** There was no documentation for this. **After.** Screen Shot 2020-01-19 at 11 20 41 PM Screen Shot 2020-01-19 at 11 21 55 PM Screen Shot 2020-01-19 at 11 22 16 PM ### How was this patch tested? Tested using jykyll build --serve Closes #27216 from dilipbiswal/sql_ref_select_hook. Authored-by: Dilip Biswal Signed-off-by: Sean Owen --- docs/sql-ref-syntax-qry-select.md | 119 +++++++++++++++++++++++++++++- 1 file changed, 115 insertions(+), 4 deletions(-) diff --git a/docs/sql-ref-syntax-qry-select.md b/docs/sql-ref-syntax-qry-select.md index 41972ef070831..05feda5f9a5dd 100644 --- a/docs/sql-ref-syntax-qry-select.md +++ b/docs/sql-ref-syntax-qry-select.md @@ -18,8 +18,119 @@ license: | See the License for the specific language governing permissions and limitations under the License. --- +Spark supports a `SELECT` statement and conforms to the ANSI SQL standard. Queries are +used to retrieve result sets from one or more tables. The following section +describes the overall query syntax and the sub-sections cover different constructs +of a query along with examples. -Spark SQL is a Apache Spark's module for working with structured data. -This guide is a reference for Structured Query Language (SQL) for Apache -Spark. This document describes the SQL constructs supported by Spark in detail -along with usage examples when applicable. +### Syntax +{% highlight sql %} +[ WITH with_query [ , ... 
] ] +SELECT [ hints , ... ] [ ALL | DISTINCT ] { named_expression [ , ... ] } + FROM { from_item [ , ...] } + [ WHERE boolean_expression ] + [ GROUP BY expression [ , ...] ] + [ HAVING boolean_expression ] + [ ORDER BY { expression [ ASC | DESC ] [ NULLS { FIRST | LAST } ] [ , ...] } ] + [ SORT BY { expression [ ASC | DESC ] [ NULLS { FIRST | LAST } ] [ , ...] } ] + [ CLUSTER BY { expression [ , ...] } ] + [ DISTRIBUTE BY { expression [, ...] } ] + { UNION | INTERSECT | EXCEPT } [ ALL | DISTINCT ] select ] + [ WINDOW { named_window [ , WINDOW named_window, ... ] } ] + [ LIMIT { ALL | expression } ] +{% endhighlight %} + +### Parameters +
+
with_query
+
+    Specifies the common table expressions (CTEs) before the main SELECT query block.
+    These table expressions can be referenced later in the main query. This is useful for abstracting
+    out repeated subquery blocks in the main query and improves the readability of the query.
+
hints
+
+    Hints can be specified to help the Spark optimizer make better planning decisions. Currently Spark supports hints
+    that influence the selection of join strategies and the repartitioning of the data.
+
ALL
+
+    Selects all matching rows from the relation. This is the default.
+
DISTINCT
+
+    Selects all matching rows from the relation after removing duplicates from the results.
+
named_expression
+
+ An expression with an assigned name. In general, it denotes a column expression.

+ Syntax: + + expression [AS] [alias] + +
+
from_item
+
+ Specifies a source of input for the query. It can be one of the following: +
    +
+      1. Table relation
+      2. Join relation
+      3. Table valued function
+      4. Inlined table
+      5. Subquery
+
+
WHERE
+
+ Filters the result of the FROM clause based on the supplied predicates. +
+
GROUP BY
+
+ Specifies the expressions that are used to group the rows. This is used in conjunction with aggregate functions + (MIN, MAX, COUNT, SUM, AVG) to group rows based on the grouping expressions. +
+
HAVING
+
+ Specifies the predicates by which the rows produced by GROUP BY are filtered. The HAVING clause is used to + filter rows after the grouping is performed. +
+
ORDER BY
+
+    Specifies an ordering of the rows of the complete result set of the query. The output rows are ordered
+    across the partitions. This parameter is mutually exclusive with SORT BY,
+    CLUSTER BY and DISTRIBUTE BY and cannot be specified together with them.
+
SORT BY
+
+    Specifies an ordering by which the rows are ordered within each partition. This parameter is mutually
+    exclusive with ORDER BY and CLUSTER BY and cannot be specified together with them.
+
CLUSTER BY
+
+    Specifies a set of expressions that is used to repartition and sort the rows. Using this clause has
+    the same effect as using DISTRIBUTE BY and SORT BY together.
+
DISTRIBUTE BY
+
+    Specifies a set of expressions by which the result rows are repartitioned. This parameter is mutually
+    exclusive with ORDER BY and CLUSTER BY and cannot be specified together with them.
+
LIMIT
+
+    Specifies the maximum number of rows that can be returned by a statement or subquery. This clause
+    is mostly used in conjunction with ORDER BY to produce a deterministic result.
+
boolean_expression
+
+ Specifies an expression with a return type of boolean. +
+
expression
+
+ Specifies a combination of one or more values, operators, and SQL functions that evaluates to a value. +
+
named_window
+
+    Specifies aliases for one or more source window specifications. The source window specifications can
+    be referenced in the window definitions in the query.
+
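+
+### Examples
+A minimal sketch of how several of the clauses described above can be combined, reusing the dealer
+table created in the GROUP BY page examples (shown only as an illustration, not an exhaustive example):
+{% highlight sql %}
+-- Total quantity per city for dealerships with id below 300,
+-- keeping only cities whose total exceeds 20, largest totals first.
+SELECT city, sum(quantity) AS total
+    FROM dealer
+    WHERE id < 300
+    GROUP BY city
+    HAVING sum(quantity) > 20
+    ORDER BY total DESC
+    LIMIT 10;
+{% endhighlight %}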
From eccae13a5faf93a524754e4cfcf71cbe8f3ad4e6 Mon Sep 17 00:00:00 2001 From: Enrico Minack Date: Wed, 22 Jan 2020 19:51:08 -0600 Subject: [PATCH 12/17] [SPARK-30531][WEB UI] Do not render plan viz when it exists already ### What changes were proposed in this pull request? When you save a Spark UI SQL query page to disk and then display the html file with your browser, the query plan will be rendered a second time. This change avoids rendering the plan visualization when it exists already. This is master: ![grafik](https://user-images.githubusercontent.com/44700269/72543429-fcb8d980-3885-11ea-82aa-c0b3638847e5.png) And with the fix: ![grafik](https://user-images.githubusercontent.com/44700269/72543641-57523580-3886-11ea-8cdf-5fb0cdffa983.png) ### Why are the changes needed? The duplicate query plan is unexpected and redundant. ### Does this PR introduce any user-facing change? No. ### How was this patch tested? Manually tested. Testing this in a reproducible way requires a running browser or HTML rendering engine that executes the JavaScript. Closes #27238 from EnricoMi/branch-sql-ui-duplicate-plan. Authored-by: Enrico Minack Signed-off-by: Sean Owen --- .../org/apache/spark/sql/execution/ui/static/spark-sql-viz.js | 4 ++++ .../org/apache/spark/sql/execution/ui/ExecutionPage.scala | 2 +- 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/sql/core/src/main/resources/org/apache/spark/sql/execution/ui/static/spark-sql-viz.js b/sql/core/src/main/resources/org/apache/spark/sql/execution/ui/static/spark-sql-viz.js index 2329fd262ddfb..754711bd5ad85 100644 --- a/sql/core/src/main/resources/org/apache/spark/sql/execution/ui/static/spark-sql-viz.js +++ b/sql/core/src/main/resources/org/apache/spark/sql/execution/ui/static/spark-sql-viz.js @@ -20,6 +20,10 @@ var PlanVizConstants = { svgMarginY: 16 }; +function shouldRenderPlanViz() { + return planVizContainer().selectAll("svg").empty(); +} + function renderPlanViz() { var svg = planVizContainer().append("svg"); var metadata = d3.select("#plan-viz-metadata"); diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/ui/ExecutionPage.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/ui/ExecutionPage.scala index 875086cda258d..91360e0e50314 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/ui/ExecutionPage.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/ui/ExecutionPage.scala @@ -116,7 +116,7 @@ class ExecutionPage(parent: SQLTab) extends WebUIPage("execution") with Logging {metadata} {planVisualizationResources(request)} - + } From bbab2bb961a8d0c8fd57a2899c94b00c747e8444 Mon Sep 17 00:00:00 2001 From: Ajith Date: Wed, 22 Jan 2020 18:21:11 -0800 Subject: [PATCH 13/17] [SPARK-30556][SQL] Copy sparkContext.localproperties to child thread inSubqueryExec.executionContext ### What changes were proposed in this pull request? In `org.apache.spark.sql.execution.SubqueryExec#relationFuture` make a copy of `org.apache.spark.SparkContext#localProperties` and pass it to the sub-execution thread in `org.apache.spark.sql.execution.SubqueryExec#executionContext` ### Why are the changes needed? Local properties set via sparkContext are not available as TaskContext properties when executing jobs and threadpools have idle threads which are reused Explanation: When `SubqueryExec`, the relationFuture is evaluated via a separate thread. The threads inherit the `localProperties` from `sparkContext` as they are the child threads. These threads are created in the `executionContext` (thread pools). 
Each Thread pool has a default keepAliveSeconds of 60 seconds for idle threads. Scenarios where the thread pool has threads which are idle and reused for a subsequent new query, the thread local properties will not be inherited from spark context (thread properties are inherited only on thread creation) hence end up having old or no properties set. This will cause taskset properties to be missing when properties are transferred by child thread via `sparkContext.runJob/submitJob` ### Does this PR introduce any user-facing change? No ### How was this patch tested? Added UT Closes #27267 from ajithme/subquerylocalprop. Authored-by: Ajith Signed-off-by: Dongjoon Hyun --- .../spark/sql/internal/StaticSQLConf.scala | 10 +++++- .../spark/sql/execution/SQLExecution.scala | 18 ++++++++++ .../execution/basicPhysicalOperators.scala | 12 ++++--- .../internal/ExecutorSideSQLConfSuite.scala | 36 +++++++++++++++++-- 4 files changed, 69 insertions(+), 7 deletions(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/StaticSQLConf.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/StaticSQLConf.scala index d2f27da239016..66ac9ddb21aaa 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/StaticSQLConf.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/StaticSQLConf.scala @@ -145,9 +145,17 @@ object StaticSQLConf { "cause longer waiting for other broadcasting. Also, increasing parallelism may " + "cause memory problem.") .intConf - .checkValue(thres => thres > 0 && thres <= 128, "The threshold must be in [0,128].") + .checkValue(thres => thres > 0 && thres <= 128, "The threshold must be in (0,128].") .createWithDefault(128) + val SUBQUERY_MAX_THREAD_THRESHOLD = + buildStaticConf("spark.sql.subquery.maxThreadThreshold") + .internal() + .doc("The maximum degree of parallelism to execute the subquery.") + .intConf + .checkValue(thres => thres > 0 && thres <= 128, "The threshold must be in (0,128].") + .createWithDefault(16) + val SQL_EVENT_TRUNCATE_LENGTH = buildStaticConf("spark.sql.event.truncate.length") .doc("Threshold of SQL length beyond which it will be truncated before adding to " + "event. Defaults to no truncation. 
If set to 0, callsite will be logged instead.") diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/SQLExecution.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/SQLExecution.scala index 6046805ae95d4..995d94ef5eac7 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/SQLExecution.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/SQLExecution.scala @@ -20,6 +20,8 @@ package org.apache.spark.sql.execution import java.util.concurrent.ConcurrentHashMap import java.util.concurrent.atomic.AtomicLong +import scala.concurrent.{ExecutionContext, Future} + import org.apache.spark.SparkContext import org.apache.spark.internal.config.Tests.IS_TESTING import org.apache.spark.sql.SparkSession @@ -164,4 +166,20 @@ object SQLExecution { } } } + + /** + * Wrap passed function to ensure necessary thread-local variables like + * SparkContext local properties are forwarded to execution thread + */ + def withThreadLocalCaptured[T]( + sparkSession: SparkSession, exec: ExecutionContext)(body: => T): Future[T] = { + val activeSession = sparkSession + val sc = sparkSession.sparkContext + val localProps = Utils.cloneProperties(sc.getLocalProperties) + Future { + SparkSession.setActiveSession(activeSession) + sc.setLocalProperties(localProps) + body + }(exec) + } } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/basicPhysicalOperators.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/basicPhysicalOperators.scala index e128d59dca6ba..f3f756425a153 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/basicPhysicalOperators.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/basicPhysicalOperators.scala @@ -31,8 +31,9 @@ import org.apache.spark.sql.catalyst.expressions.BindReferences.bindReferences import org.apache.spark.sql.catalyst.expressions.codegen._ import org.apache.spark.sql.catalyst.plans.physical._ import org.apache.spark.sql.execution.metric.SQLMetrics +import org.apache.spark.sql.internal.{SQLConf, StaticSQLConf} import org.apache.spark.sql.types.{LongType, StructType} -import org.apache.spark.util.ThreadUtils +import org.apache.spark.util.{ThreadUtils, Utils} import org.apache.spark.util.random.{BernoulliCellSampler, PoissonSampler} /** Physical plan for Project. */ @@ -749,7 +750,9 @@ case class SubqueryExec(name: String, child: SparkPlan) private lazy val relationFuture: Future[Array[InternalRow]] = { // relationFuture is used in "doExecute". Therefore we can get the execution id correctly here. val executionId = sparkContext.getLocalProperty(SQLExecution.EXECUTION_ID_KEY) - Future { + SQLExecution.withThreadLocalCaptured[Array[InternalRow]]( + sqlContext.sparkSession, + SubqueryExec.executionContext) { // This will run in another thread. Set the execution id so that we can connect these jobs // with the correct execution. 
SQLExecution.withExecutionId(sqlContext.sparkSession, executionId) { @@ -764,7 +767,7 @@ case class SubqueryExec(name: String, child: SparkPlan) SQLMetrics.postDriverMetricUpdates(sparkContext, executionId, metrics.values.toSeq) rows } - }(SubqueryExec.executionContext) + } } protected override def doCanonicalize(): SparkPlan = { @@ -788,7 +791,8 @@ case class SubqueryExec(name: String, child: SparkPlan) object SubqueryExec { private[execution] val executionContext = ExecutionContext.fromExecutorService( - ThreadUtils.newDaemonCachedThreadPool("subquery", 16)) + ThreadUtils.newDaemonCachedThreadPool("subquery", + SQLConf.get.getConf(StaticSQLConf.SUBQUERY_MAX_THREAD_THRESHOLD))) } /** diff --git a/sql/core/src/test/scala/org/apache/spark/sql/internal/ExecutorSideSQLConfSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/internal/ExecutorSideSQLConfSuite.scala index 776cdb107084d..0cc658c499615 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/internal/ExecutorSideSQLConfSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/internal/ExecutorSideSQLConfSuite.scala @@ -19,9 +19,9 @@ package org.apache.spark.sql.internal import org.scalatest.Assertions._ -import org.apache.spark.{SparkException, SparkFunSuite} +import org.apache.spark.{SparkException, SparkFunSuite, TaskContext} import org.apache.spark.rdd.RDD -import org.apache.spark.sql.SparkSession +import org.apache.spark.sql.{Dataset, SparkSession} import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions.Attribute import org.apache.spark.sql.catalyst.plans.logical.LocalRelation @@ -125,6 +125,38 @@ class ExecutorSideSQLConfSuite extends SparkFunSuite with SQLTestUtils { val e = intercept[SparkException](dummyQueryExecution1.toRdd.collect()) assert(e.getCause.isInstanceOf[NoSuchElementException]) } + + test("SPARK-30556 propagate local properties to subquery execution thread") { + withSQLConf(StaticSQLConf.SUBQUERY_MAX_THREAD_THRESHOLD.key -> "1") { + withTempView("l", "m", "n") { + Seq(true).toDF().createOrReplaceTempView("l") + val confKey = "spark.sql.y" + + def createDataframe(confKey: String, confValue: String): Dataset[Boolean] = { + Seq(true) + .toDF() + .mapPartitions { _ => + TaskContext.get.getLocalProperty(confKey) == confValue match { + case true => Iterator(true) + case false => Iterator.empty + } + } + } + + // set local configuration and assert + val confValue1 = "e" + createDataframe(confKey, confValue1).createOrReplaceTempView("m") + spark.sparkContext.setLocalProperty(confKey, confValue1) + assert(sql("SELECT * FROM l WHERE EXISTS (SELECT * FROM m)").collect.size == 1) + + // change the conf value and assert again + val confValue2 = "f" + createDataframe(confKey, confValue2).createOrReplaceTempView("n") + spark.sparkContext.setLocalProperty(confKey, confValue2) + assert(sql("SELECT * FROM l WHERE EXISTS (SELECT * FROM n)").collect().size == 1) + } + } + } } case class SQLConfAssertPlan(confToCheck: Seq[(String, String)]) extends LeafExecNode { From d2bca8ff70e6c82e915f633bb9f2f8a4582f7026 Mon Sep 17 00:00:00 2001 From: Tathagata Das Date: Wed, 22 Jan 2020 19:20:25 -0800 Subject: [PATCH 14/17] [SPARK-30609] Allow default merge command resolution to be bypassed by DSv2 tables ### What changes were proposed in this pull request? Skip resolving the merge expressions if the target is a DSv2 table with ACCEPT_ANY_SCHEMA capability. ### Why are the changes needed? Some DSv2 sources may want to customize the merge resolution logic. 
For example, a table that can accept any schema (TableCapability.ACCEPT_ANY_SCHEMA) may want to allow certain merge queries that are blocked (that is, throws AnalysisError) by the default resolution logic. So there should be a way to completely bypass the merge resolution logic in the Analyzer. ### Does this PR introduce any user-facing change? No, since merge itself is an unreleased feature ### How was this patch tested? added unit test to specifically test the skipping. Closes #27326 from tdas/SPARK-30609. Authored-by: Tathagata Das Signed-off-by: Tathagata Das --- .../sql/catalyst/analysis/Analyzer.scala | 62 +++++++++++-------- .../command/PlanResolutionSuite.scala | 52 +++++++++++++++- 2 files changed, 86 insertions(+), 28 deletions(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala index 7e9f85b64e4a9..503dab1782172 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala @@ -1326,33 +1326,43 @@ class Analyzer( case m @ MergeIntoTable(targetTable, sourceTable, _, _, _) if !m.resolved && targetTable.resolved && sourceTable.resolved => - val newMatchedActions = m.matchedActions.map { - case DeleteAction(deleteCondition) => - val resolvedDeleteCondition = deleteCondition.map(resolveExpressionTopDown(_, m)) - DeleteAction(resolvedDeleteCondition) - case UpdateAction(updateCondition, assignments) => - val resolvedUpdateCondition = updateCondition.map(resolveExpressionTopDown(_, m)) - // The update value can access columns from both target and source tables. - UpdateAction( - resolvedUpdateCondition, - resolveAssignments(assignments, m, resolveValuesWithSourceOnly = false)) - case o => o - } - val newNotMatchedActions = m.notMatchedActions.map { - case InsertAction(insertCondition, assignments) => - // The insert action is used when not matched, so its condition and value can only - // access columns from the source table. - val resolvedInsertCondition = - insertCondition.map(resolveExpressionTopDown(_, Project(Nil, m.sourceTable))) - InsertAction( - resolvedInsertCondition, - resolveAssignments(assignments, m, resolveValuesWithSourceOnly = true)) - case o => o + + EliminateSubqueryAliases(targetTable) match { + case r: NamedRelation if r.skipSchemaResolution => + // Do not resolve the expression if the target table accepts any schema. + // This allows data sources to customize their own resolution logic using + // custom resolution rules. + m + + case _ => + val newMatchedActions = m.matchedActions.map { + case DeleteAction(deleteCondition) => + val resolvedDeleteCondition = deleteCondition.map(resolveExpressionTopDown(_, m)) + DeleteAction(resolvedDeleteCondition) + case UpdateAction(updateCondition, assignments) => + val resolvedUpdateCondition = updateCondition.map(resolveExpressionTopDown(_, m)) + // The update value can access columns from both target and source tables. + UpdateAction( + resolvedUpdateCondition, + resolveAssignments(assignments, m, resolveValuesWithSourceOnly = false)) + case o => o + } + val newNotMatchedActions = m.notMatchedActions.map { + case InsertAction(insertCondition, assignments) => + // The insert action is used when not matched, so its condition and value can only + // access columns from the source table. 
+ val resolvedInsertCondition = + insertCondition.map(resolveExpressionTopDown(_, Project(Nil, m.sourceTable))) + InsertAction( + resolvedInsertCondition, + resolveAssignments(assignments, m, resolveValuesWithSourceOnly = true)) + case o => o + } + val resolvedMergeCondition = resolveExpressionTopDown(m.mergeCondition, m) + m.copy(mergeCondition = resolvedMergeCondition, + matchedActions = newMatchedActions, + notMatchedActions = newNotMatchedActions) } - val resolvedMergeCondition = resolveExpressionTopDown(m.mergeCondition, m) - m.copy(mergeCondition = resolvedMergeCondition, - matchedActions = newMatchedActions, - notMatchedActions = newNotMatchedActions) case q: LogicalPlan => logTrace(s"Attempting to resolve ${q.simpleString(SQLConf.get.maxToStringFields)}") diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/command/PlanResolutionSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/command/PlanResolutionSuite.scala index 8c73b366fa857..35b2003e952e1 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/command/PlanResolutionSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/command/PlanResolutionSuite.scala @@ -18,7 +18,7 @@ package org.apache.spark.sql.execution.command import java.net.URI -import java.util.Locale +import java.util.{Collections, Locale} import org.mockito.ArgumentMatchers.any import org.mockito.Mockito.{mock, when} @@ -32,7 +32,7 @@ import org.apache.spark.sql.catalyst.expressions.{AttributeReference, EqualTo, E import org.apache.spark.sql.catalyst.parser.CatalystSqlParser import org.apache.spark.sql.catalyst.plans.logical.{AlterTable, Assignment, CreateTableAsSelect, CreateV2Table, DeleteAction, DeleteFromTable, DescribeRelation, DropTable, InsertAction, LocalRelation, LogicalPlan, MergeIntoTable, OneRowRelation, Project, SubqueryAlias, UpdateAction, UpdateTable} import org.apache.spark.sql.connector.InMemoryTableProvider -import org.apache.spark.sql.connector.catalog.{CatalogManager, CatalogNotFoundException, Identifier, Table, TableCatalog, TableChange, V1Table} +import org.apache.spark.sql.connector.catalog.{CatalogManager, CatalogNotFoundException, Identifier, Table, TableCapability, TableCatalog, TableChange, V1Table} import org.apache.spark.sql.execution.datasources.CreateTable import org.apache.spark.sql.execution.datasources.v2.DataSourceV2Relation import org.apache.spark.sql.internal.SQLConf @@ -49,6 +49,13 @@ class PlanResolutionSuite extends AnalysisTest { t } + private val tableWithAcceptAnySchemaCapability: Table = { + val t = mock(classOf[Table]) + when(t.schema()).thenReturn(new StructType().add("i", "int")) + when(t.capabilities()).thenReturn(Collections.singleton(TableCapability.ACCEPT_ANY_SCHEMA)) + t + } + private val v1Table: V1Table = { val t = mock(classOf[CatalogTable]) when(t.schema).thenReturn(new StructType().add("i", "int").add("s", "string")) @@ -77,6 +84,7 @@ class PlanResolutionSuite extends AnalysisTest { case "v1Table1" => v1Table case "v2Table" => table case "v2Table1" => table + case "v2TableWithAcceptAnySchemaCapability" => tableWithAcceptAnySchemaCapability case name => throw new NoSuchTableException(name) } }) @@ -1351,5 +1359,45 @@ class PlanResolutionSuite extends AnalysisTest { } } + test("MERGE INTO TABLE - skip resolution on v2 tables that accept any schema") { + val sql = + s""" + |MERGE INTO v2TableWithAcceptAnySchemaCapability AS target + |USING v2Table AS source + |ON target.i = source.i + |WHEN MATCHED AND (target.s='delete') THEN DELETE + |WHEN 
MATCHED AND (target.s='update') THEN UPDATE SET target.s = source.s + |WHEN NOT MATCHED AND (target.s='insert') + | THEN INSERT (target.i, target.s) values (source.i, source.s) + """.stripMargin + + parseAndResolve(sql) match { + case MergeIntoTable( + SubqueryAlias(AliasIdentifier("target", None), _: DataSourceV2Relation), + SubqueryAlias(AliasIdentifier("source", None), _: DataSourceV2Relation), + EqualTo(l: UnresolvedAttribute, r: UnresolvedAttribute), + Seq( + DeleteAction(Some(EqualTo(dl: UnresolvedAttribute, StringLiteral("delete")))), + UpdateAction( + Some(EqualTo(ul: UnresolvedAttribute, StringLiteral("update"))), + updateAssigns)), + Seq( + InsertAction( + Some(EqualTo(il: UnresolvedAttribute, StringLiteral("insert"))), + insertAssigns))) => + assert(l.name == "target.i" && r.name == "source.i") + assert(dl.name == "target.s") + assert(ul.name == "target.s") + assert(il.name == "target.s") + assert(updateAssigns.size == 1) + assert(updateAssigns.head.key.asInstanceOf[UnresolvedAttribute].name == "target.s") + assert(updateAssigns.head.value.asInstanceOf[UnresolvedAttribute].name == "source.s") + assert(insertAssigns.size == 2) + assert(insertAssigns.head.key.asInstanceOf[UnresolvedAttribute].name == "target.i") + assert(insertAssigns.head.value.asInstanceOf[UnresolvedAttribute].name == "source.i") + + case l => fail("Expected unresolved MergeIntoTable, but got:\n" + l.treeString) + } + } // TODO: add tests for more commands. } From db528e4fe1907b6bbb1a2e4132427b5c1345710d Mon Sep 17 00:00:00 2001 From: Burak Yavuz Date: Wed, 22 Jan 2020 22:43:46 -0800 Subject: [PATCH 15/17] [SPARK-30535][SQL] Revert "[] Migrate ALTER TABLE commands to the new framework ### What changes were proposed in this pull request? This reverts commit b5cb9abdd5ee286cc2b0a06cb5f3eac812922a31. ### Why are the changes needed? The merged commit (#27243) was too risky for several reasons: 1. It doesn't fix a bug 2. It makes the resolution of the table that's going to be altered a child. We had avoided this on purpose as having an arbitrary rule change the child of AlterTable seemed risky. This change alone is a big -1 for me for this change. 3. While the code may look cleaner, I think this approach makes certain things harder, e.g. differentiating between the Hive based Alter table CHANGE COLUMN and ALTER COLUMN syntax. Resolving and normalizing columns for ALTER COLUMN also becomes a bit harder, as we now have to check every single AlterTable command instead of just a single ALTER TABLE ALTER COLUMN statement ### Does this PR introduce any user-facing change? No ### How was this patch tested? Existing unit tests This closes #27315 Closes #27327 from brkyvz/revAlter. 
Authored-by: Burak Yavuz Signed-off-by: Xiao Li --- .../sql/catalyst/analysis/Analyzer.scala | 25 ++- .../sql/catalyst/analysis/CheckAnalysis.scala | 41 ++-- .../catalyst/analysis/ResolveCatalogs.scala | 67 ++++++- .../sql/catalyst/analysis/unresolved.scala | 23 +++ .../catalyst/analysis/v2ResolutionPlans.scala | 14 +- .../sql/catalyst/parser/AstBuilder.scala | 50 ++--- .../catalyst/plans/logical/statements.scala | 56 ++++++ .../catalyst/plans/logical/v2Commands.scala | 138 +++----------- .../sql/connector/catalog/CatalogV2Util.scala | 14 +- .../sql/catalyst/parser/DDLParserSuite.scala | 90 ++++----- .../analysis/ResolveSessionCatalog.scala | 178 +++++++++++++----- .../spark/sql/execution/command/tables.scala | 8 + .../datasources/v2/DataSourceV2Strategy.scala | 14 +- .../sql-tests/results/change-column.sql.out | 4 +- .../sql/connector/DataSourceV2SQLSuite.scala | 2 +- .../spark/sql/execution/SQLViewSuite.scala | 8 +- .../sql/execution/command/DDLSuite.scala | 5 +- .../command/PlanResolutionSuite.scala | 47 ++--- .../sql/hive/execution/HiveCommandSuite.scala | 2 +- 19 files changed, 462 insertions(+), 324 deletions(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala index 503dab1782172..36e558b0dc571 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala @@ -755,14 +755,12 @@ class Analyzer( .map(view => i.copy(table = view)) .getOrElse(i) case u @ UnresolvedTable(ident) => - lookupTempView(ident) - .map(_ => UnresolvedTableWithViewExists( - ResolvedView(ident.asIdentifier, isTempView = true))) - .getOrElse(u) + lookupTempView(ident).foreach { _ => + u.failAnalysis(s"${ident.quoted} is a temp view not table.") + } + u case u @ UnresolvedTableOrView(ident) => - lookupTempView(ident) - .map(_ => ResolvedView(ident.asIdentifier, isTempView = true)) - .getOrElse(u) + lookupTempView(ident).map(_ => ResolvedView(ident.asIdentifier)).getOrElse(u) } def lookupTempView(identifier: Seq[String]): Option[LogicalPlan] = { @@ -816,6 +814,14 @@ class Analyzer( lookupV2Relation(u.multipartIdentifier) .map(v2Relation => i.copy(table = v2Relation)) .getOrElse(i) + + case alter @ AlterTable(_, _, u: UnresolvedV2Relation, _) => + CatalogV2Util.loadRelation(u.catalog, u.tableName) + .map(rel => alter.copy(table = rel)) + .getOrElse(alter) + + case u: UnresolvedV2Relation => + CatalogV2Util.loadRelation(u.catalog, u.tableName).getOrElse(u) } /** @@ -882,7 +888,8 @@ class Analyzer( case u @ UnresolvedTable(identifier) => lookupTableOrView(identifier).map { - case v: ResolvedView => UnresolvedTableWithViewExists(v) + case v: ResolvedView => + u.failAnalysis(s"${v.identifier.quoted} is a view not table.") case table => table }.getOrElse(u) @@ -895,7 +902,7 @@ class Analyzer( case SessionCatalogAndIdentifier(catalog, ident) => CatalogV2Util.loadTable(catalog, ident).map { case v1Table: V1Table if v1Table.v1Table.tableType == CatalogTableType.VIEW => - ResolvedView(ident, isTempView = false) + ResolvedView(ident) case table => ResolvedTable(catalog.asTableCatalog, ident, table) } diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/CheckAnalysis.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/CheckAnalysis.scala index 624c25d95c704..d6fc1dc6ddc3d 100644 --- 
a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/CheckAnalysis.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/CheckAnalysis.scala @@ -26,7 +26,7 @@ import org.apache.spark.sql.catalyst.optimizer.BooleanSimplification import org.apache.spark.sql.catalyst.plans._ import org.apache.spark.sql.catalyst.plans.logical._ import org.apache.spark.sql.catalyst.util.TypeUtils -import org.apache.spark.sql.connector.catalog.TableChange.{AddColumn, DeleteColumn, RenameColumn, UpdateColumnComment, UpdateColumnNullability, UpdateColumnPosition, UpdateColumnType} +import org.apache.spark.sql.connector.catalog.TableChange.{AddColumn, DeleteColumn, RenameColumn, UpdateColumnComment, UpdateColumnNullability, UpdateColumnType} import org.apache.spark.sql.internal.SQLConf import org.apache.spark.sql.types._ @@ -87,20 +87,6 @@ trait CheckAnalysis extends PredicateHelper { } def checkAnalysis(plan: LogicalPlan): Unit = { - // Analysis that needs to be performed top down can be added here. - plan.foreach { - case p if p.analyzed => // Skip already analyzed sub-plans - - case alter: AlterTable => - alter.table match { - case u @ UnresolvedTableWithViewExists(view) if !view.isTempView => - u.failAnalysis("Cannot alter a view with ALTER TABLE. Please use ALTER VIEW instead") - case _ => - } - - case _ => // Analysis successful! - } - // We transform up and order the rules so as to catch the first possible failure instead // of the result of cascading resolution failures. plan.foreachUp { @@ -119,13 +105,23 @@ trait CheckAnalysis extends PredicateHelper { case u: UnresolvedRelation => u.failAnalysis(s"Table or view not found: ${u.multipartIdentifier.quoted}") - case u: UnresolvedTableWithViewExists => - val viewKind = if (u.view.isTempView) { "temp view" } else { "view" } - u.failAnalysis(s"${u.view.identifier.quoted} is a $viewKind not a table.") - case InsertIntoStatement(u: UnresolvedRelation, _, _, _, _) => failAnalysis(s"Table not found: ${u.multipartIdentifier.quoted}") + case u: UnresolvedV2Relation if isView(u.originalNameParts) => + u.failAnalysis( + s"Invalid command: '${u.originalNameParts.quoted}' is a view not a table.") + + case u: UnresolvedV2Relation => + u.failAnalysis(s"Table not found: ${u.originalNameParts.quoted}") + + case AlterTable(_, _, u: UnresolvedV2Relation, _) if isView(u.originalNameParts) => + u.failAnalysis( + s"Invalid command: '${u.originalNameParts.quoted}' is a view not a table.") + + case AlterTable(_, _, u: UnresolvedV2Relation, _) => + failAnalysis(s"Table not found: ${u.originalNameParts.quoted}") + case operator: LogicalPlan => // Check argument data types of higher-order functions downwards first. 
// If the arguments of the higher-order functions are resolved but the type check fails, @@ -429,9 +425,8 @@ trait CheckAnalysis extends PredicateHelper { case _ => } - case alter: AlterTable - if alter.childrenResolved && alter.table.isInstanceOf[ResolvedTable] => - val table = alter.table.asInstanceOf[ResolvedTable].table + case alter: AlterTable if alter.childrenResolved => + val table = alter.table def findField(operation: String, fieldName: Array[String]): StructField = { // include collections because structs nested in maps and arrays may be altered val field = table.schema.findNestedField(fieldName, includeCollections = true) @@ -484,8 +479,6 @@ trait CheckAnalysis extends PredicateHelper { throw new AnalysisException( s"Cannot change nullable column to non-nullable: $fieldName") } - case update: UpdateColumnPosition => - findField("update", update.fieldNames) case rename: RenameColumn => findField("rename", rename.fieldNames) case update: UpdateColumnComment => diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/ResolveCatalogs.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/ResolveCatalogs.scala index a44877fc1b4dd..88a3c0a73a10b 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/ResolveCatalogs.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/ResolveCatalogs.scala @@ -20,7 +20,7 @@ package org.apache.spark.sql.catalyst.analysis import org.apache.spark.sql.AnalysisException import org.apache.spark.sql.catalyst.plans.logical._ import org.apache.spark.sql.catalyst.rules.Rule -import org.apache.spark.sql.connector.catalog.{CatalogManager, CatalogPlugin, LookupCatalog} +import org.apache.spark.sql.connector.catalog.{CatalogManager, CatalogPlugin, LookupCatalog, SupportsNamespaces, TableCatalog, TableChange} /** * Resolves catalogs from the multi-part identifiers in SQL statements, and convert the statements @@ -32,6 +32,71 @@ class ResolveCatalogs(val catalogManager: CatalogManager) import org.apache.spark.sql.connector.catalog.CatalogV2Util._ override def apply(plan: LogicalPlan): LogicalPlan = plan resolveOperators { + case AlterTableAddColumnsStatement( + nameParts @ NonSessionCatalogAndTable(catalog, tbl), cols) => + val changes = cols.map { col => + TableChange.addColumn( + col.name.toArray, + col.dataType, + col.nullable, + col.comment.orNull, + col.position.orNull) + } + createAlterTable(nameParts, catalog, tbl, changes) + + case a @ AlterTableAlterColumnStatement( + nameParts @ NonSessionCatalogAndTable(catalog, tbl), _, _, _, _, _) => + val colName = a.column.toArray + val typeChange = a.dataType.map { newDataType => + TableChange.updateColumnType(colName, newDataType) + } + val nullabilityChange = a.nullable.map { nullable => + TableChange.updateColumnNullability(colName, nullable) + } + val commentChange = a.comment.map { newComment => + TableChange.updateColumnComment(colName, newComment) + } + val positionChange = a.position.map { newPosition => + TableChange.updateColumnPosition(colName, newPosition) + } + createAlterTable( + nameParts, + catalog, + tbl, + typeChange.toSeq ++ nullabilityChange ++ commentChange ++ positionChange) + + case AlterTableRenameColumnStatement( + nameParts @ NonSessionCatalogAndTable(catalog, tbl), col, newName) => + val changes = Seq(TableChange.renameColumn(col.toArray, newName)) + createAlterTable(nameParts, catalog, tbl, changes) + + case AlterTableDropColumnsStatement( + nameParts @ NonSessionCatalogAndTable(catalog, tbl), cols) 
=> + val changes = cols.map(col => TableChange.deleteColumn(col.toArray)) + createAlterTable(nameParts, catalog, tbl, changes) + + case AlterTableSetPropertiesStatement( + nameParts @ NonSessionCatalogAndTable(catalog, tbl), props) => + val changes = props.map { case (key, value) => + TableChange.setProperty(key, value) + }.toSeq + createAlterTable(nameParts, catalog, tbl, changes) + + // TODO: v2 `UNSET TBLPROPERTIES` should respect the ifExists flag. + case AlterTableUnsetPropertiesStatement( + nameParts @ NonSessionCatalogAndTable(catalog, tbl), keys, _) => + val changes = keys.map(key => TableChange.removeProperty(key)) + createAlterTable(nameParts, catalog, tbl, changes) + + case AlterTableSetLocationStatement( + nameParts @ NonSessionCatalogAndTable(catalog, tbl), partitionSpec, newLoc) => + if (partitionSpec.nonEmpty) { + throw new AnalysisException( + "ALTER TABLE SET LOCATION does not support partition for v2 tables.") + } + val changes = Seq(TableChange.setProperty(TableCatalog.PROP_LOCATION, newLoc)) + createAlterTable(nameParts, catalog, tbl, changes) + case AlterViewSetPropertiesStatement( NonSessionCatalogAndTable(catalog, tbl), props) => throw new AnalysisException( diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/unresolved.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/unresolved.scala index c7d977a3d4a82..608f39c2d86fd 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/unresolved.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/unresolved.scala @@ -26,6 +26,7 @@ import org.apache.spark.sql.catalyst.parser.ParserUtils import org.apache.spark.sql.catalyst.plans.logical.{LeafNode, LogicalPlan, UnaryNode} import org.apache.spark.sql.catalyst.trees.TreeNode import org.apache.spark.sql.catalyst.util.quoteIdentifier +import org.apache.spark.sql.connector.catalog.{Identifier, TableCatalog} import org.apache.spark.sql.types.{DataType, Metadata, StructType} /** @@ -59,6 +60,28 @@ object UnresolvedRelation { UnresolvedRelation(tableIdentifier.database.toSeq :+ tableIdentifier.table) } +/** + * A variant of [[UnresolvedRelation]] which can only be resolved to a v2 relation + * (`DataSourceV2Relation`), not v1 relation or temp view. + * + * @param originalNameParts the original table identifier name parts before catalog is resolved. + * @param catalog The catalog which the table should be looked up from. + * @param tableName The name of the table to look up. + */ +case class UnresolvedV2Relation( + originalNameParts: Seq[String], + catalog: TableCatalog, + tableName: Identifier) + extends LeafNode with NamedRelation { + import org.apache.spark.sql.connector.catalog.CatalogV2Implicits._ + + override def name: String = originalNameParts.quoted + + override def output: Seq[Attribute] = Nil + + override lazy val resolved = false +} + /** * An inline table that has not been resolved yet. Once resolved, it is turned by the analyzer into * a [[org.apache.spark.sql.catalyst.plans.logical.LocalRelation]]. 
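
For illustration, here is a minimal sketch (not taken from the patch) of what the `ResolveCatalogs` rule above is expected to produce for a v2 ALTER COLUMN statement. The catalog, table, column, and comment text below are hypothetical; the `TableChange` factory methods are the same ones called in the rule itself.

```scala
// Hypothetical statement: ALTER TABLE testcat.ns.tbl ALTER COLUMN i TYPE bigint COMMENT 'new doc'
import org.apache.spark.sql.connector.catalog.TableChange
import org.apache.spark.sql.types.LongType

val colName = Array("i")
// The statement carries a type change and a comment change, so the rule builds two
// TableChanges and wraps them in a single AlterTable plan whose child stays an
// UnresolvedV2Relation until the table is looked up later in analysis.
val expectedChanges = Seq(
  TableChange.updateColumnType(colName, LongType),
  TableChange.updateColumnComment(colName, "new doc"))
// Expected plan shape:
//   AlterTable(catalog, ident, UnresolvedV2Relation(nameParts, catalog, ident), expectedChanges)
```
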
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/v2ResolutionPlans.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/v2ResolutionPlans.scala index c6300b0bb079d..239f987e97a76 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/v2ResolutionPlans.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/v2ResolutionPlans.scala @@ -18,7 +18,7 @@ package org.apache.spark.sql.catalyst.analysis import org.apache.spark.sql.catalyst.expressions.Attribute -import org.apache.spark.sql.catalyst.plans.logical.LeafNode +import org.apache.spark.sql.catalyst.plans.logical.{LeafNode, LogicalPlan} import org.apache.spark.sql.connector.catalog.{Identifier, SupportsNamespaces, Table, TableCatalog} /** @@ -41,16 +41,6 @@ case class UnresolvedTable(multipartIdentifier: Seq[String]) extends LeafNode { override def output: Seq[Attribute] = Nil } -/** - * Holds the resolved view. It is used in a scenario where table is expected but the identifier - * is resolved to a (temp) view. - */ -case class UnresolvedTableWithViewExists(view: ResolvedView) extends LeafNode { - override lazy val resolved: Boolean = false - - override def output: Seq[Attribute] = Nil -} - /** * Holds the name of a table or view that has yet to be looked up in a catalog. It will * be resolved to [[ResolvedTable]] or [[ResolvedView]] during analysis. @@ -81,6 +71,6 @@ case class ResolvedTable(catalog: TableCatalog, identifier: Identifier, table: T */ // TODO: create a generic representation for temp view, v1 view and v2 view, after we add view // support to v2 catalog. For now we only need the identifier to fallback to v1 command. -case class ResolvedView(identifier: Identifier, isTempView: Boolean) extends LeafNode { +case class ResolvedView(identifier: Identifier) extends LeafNode { override def output: Seq[Attribute] = Nil } diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/parser/AstBuilder.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/parser/AstBuilder.scala index a8a96f0f6803a..e1dca4e945397 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/parser/AstBuilder.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/parser/AstBuilder.scala @@ -2908,7 +2908,7 @@ class AstBuilder(conf: SQLConf) extends SqlBaseBaseVisitor[AnyRef] with Logging } /** - * Parse a [[AlterTableAddColumns]] command. + * Parse a [[AlterTableAddColumnsStatement]] command. * * For example: * {{{ @@ -2917,14 +2917,14 @@ class AstBuilder(conf: SQLConf) extends SqlBaseBaseVisitor[AnyRef] with Logging * }}} */ override def visitAddTableColumns(ctx: AddTableColumnsContext): LogicalPlan = withOrigin(ctx) { - AlterTableAddColumns( - UnresolvedTable(visitMultipartIdentifier(ctx.multipartIdentifier)), + AlterTableAddColumnsStatement( + visitMultipartIdentifier(ctx.multipartIdentifier), ctx.columns.qualifiedColTypeWithPosition.asScala.map(typedVisit[QualifiedColType]) ) } /** - * Parse a [[AlterTableRenameColumn]] command. + * Parse a [[AlterTableRenameColumnStatement]] command. 
* * For example: * {{{ @@ -2933,14 +2933,14 @@ class AstBuilder(conf: SQLConf) extends SqlBaseBaseVisitor[AnyRef] with Logging */ override def visitRenameTableColumn( ctx: RenameTableColumnContext): LogicalPlan = withOrigin(ctx) { - AlterTableRenameColumn( - UnresolvedTable(visitMultipartIdentifier(ctx.table)), + AlterTableRenameColumnStatement( + visitMultipartIdentifier(ctx.table), ctx.from.parts.asScala.map(_.getText), ctx.to.getText) } /** - * Parse a [[AlterTableAlterColumn]] command. + * Parse a [[AlterTableAlterColumnStatement]] command. * * For example: * {{{ @@ -2957,8 +2957,8 @@ class AstBuilder(conf: SQLConf) extends SqlBaseBaseVisitor[AnyRef] with Logging s"ALTER TABLE table $verb COLUMN requires a TYPE or a COMMENT or a FIRST/AFTER", ctx) } - AlterTableAlterColumn( - UnresolvedTable(visitMultipartIdentifier(ctx.table)), + AlterTableAlterColumnStatement( + visitMultipartIdentifier(ctx.table), typedVisit[Seq[String]](ctx.column), dataType = Option(ctx.dataType).map(typedVisit[DataType]), nullable = None, @@ -2967,7 +2967,7 @@ class AstBuilder(conf: SQLConf) extends SqlBaseBaseVisitor[AnyRef] with Logging } /** - * Parse a [[AlterTableAlterColumn]] command to change column nullability. + * Parse a [[AlterTableAlterColumnStatement]] command to change column nullability. * * For example: * {{{ @@ -2981,8 +2981,8 @@ class AstBuilder(conf: SQLConf) extends SqlBaseBaseVisitor[AnyRef] with Logging case SqlBaseParser.SET => false case SqlBaseParser.DROP => true } - AlterTableAlterColumn( - UnresolvedTable(visitMultipartIdentifier(ctx.table)), + AlterTableAlterColumnStatement( + visitMultipartIdentifier(ctx.table), typedVisit[Seq[String]](ctx.column), dataType = None, nullable = Some(nullable), @@ -2992,7 +2992,7 @@ class AstBuilder(conf: SQLConf) extends SqlBaseBaseVisitor[AnyRef] with Logging } /** - * Parse a [[AlterTableAlterColumn]] command. This is Hive SQL syntax. + * Parse a [[AlterTableAlterColumnStatement]] command. This is Hive SQL syntax. * * For example: * {{{ @@ -3015,8 +3015,8 @@ class AstBuilder(conf: SQLConf) extends SqlBaseBaseVisitor[AnyRef] with Logging "please run ALTER COLUMN ... SET/DROP NOT NULL instead.") } - AlterTableAlterColumn( - UnresolvedTable(typedVisit[Seq[String]](ctx.table)), + AlterTableAlterColumnStatement( + typedVisit[Seq[String]](ctx.table), columnNameParts, dataType = Option(ctx.colType().dataType()).map(typedVisit[DataType]), nullable = None, @@ -3025,7 +3025,7 @@ class AstBuilder(conf: SQLConf) extends SqlBaseBaseVisitor[AnyRef] with Logging } /** - * Parse a [[AlterTableDropColumns]] command. + * Parse a [[AlterTableDropColumnsStatement]] command. * * For example: * {{{ @@ -3036,13 +3036,13 @@ class AstBuilder(conf: SQLConf) extends SqlBaseBaseVisitor[AnyRef] with Logging override def visitDropTableColumns( ctx: DropTableColumnsContext): LogicalPlan = withOrigin(ctx) { val columnsToDrop = ctx.columns.multipartIdentifier.asScala.map(typedVisit[Seq[String]]) - AlterTableDropColumns( - UnresolvedTable(visitMultipartIdentifier(ctx.multipartIdentifier)), + AlterTableDropColumnsStatement( + visitMultipartIdentifier(ctx.multipartIdentifier), columnsToDrop) } /** - * Parse [[AlterViewSetPropertiesStatement]] or [[AlterTableSetProperties]] commands. + * Parse [[AlterViewSetPropertiesStatement]] or [[AlterTableSetPropertiesStatement]] commands. 
* * For example: * {{{ @@ -3058,12 +3058,12 @@ class AstBuilder(conf: SQLConf) extends SqlBaseBaseVisitor[AnyRef] with Logging if (ctx.VIEW != null) { AlterViewSetPropertiesStatement(identifier, cleanedTableProperties) } else { - AlterTableSetProperties(UnresolvedTable(identifier), cleanedTableProperties) + AlterTableSetPropertiesStatement(identifier, cleanedTableProperties) } } /** - * Parse [[AlterViewUnsetPropertiesStatement]] or [[AlterTableUnsetProperties]] commands. + * Parse [[AlterViewUnsetPropertiesStatement]] or [[AlterTableUnsetPropertiesStatement]] commands. * * For example: * {{{ @@ -3081,12 +3081,12 @@ class AstBuilder(conf: SQLConf) extends SqlBaseBaseVisitor[AnyRef] with Logging if (ctx.VIEW != null) { AlterViewUnsetPropertiesStatement(identifier, cleanedProperties, ifExists) } else { - AlterTableUnsetProperties(UnresolvedTable(identifier), cleanedProperties, ifExists) + AlterTableUnsetPropertiesStatement(identifier, cleanedProperties, ifExists) } } /** - * Create an [[AlterTableSetLocation]] command. + * Create an [[AlterTableSetLocationStatement]] command. * * For example: * {{{ @@ -3094,8 +3094,8 @@ class AstBuilder(conf: SQLConf) extends SqlBaseBaseVisitor[AnyRef] with Logging * }}} */ override def visitSetTableLocation(ctx: SetTableLocationContext): LogicalPlan = withOrigin(ctx) { - AlterTableSetLocation( - UnresolvedTable(visitMultipartIdentifier(ctx.multipartIdentifier)), + AlterTableSetLocationStatement( + visitMultipartIdentifier(ctx.multipartIdentifier), Option(ctx.partitionSpec).map(visitNonOptionalPartitionSpec), visitLocationSpec(ctx.locationSpec)) } diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/statements.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/statements.scala index 2083a00cae0da..44f7b4143926d 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/statements.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/statements.scala @@ -149,6 +149,62 @@ case class QualifiedColType( comment: Option[String], position: Option[ColumnPosition]) +/** + * ALTER TABLE ... ADD COLUMNS command, as parsed from SQL. + */ +case class AlterTableAddColumnsStatement( + tableName: Seq[String], + columnsToAdd: Seq[QualifiedColType]) extends ParsedStatement + +/** + * ALTER TABLE ... CHANGE COLUMN command, as parsed from SQL. + */ +case class AlterTableAlterColumnStatement( + tableName: Seq[String], + column: Seq[String], + dataType: Option[DataType], + nullable: Option[Boolean], + comment: Option[String], + position: Option[ColumnPosition]) extends ParsedStatement + +/** + * ALTER TABLE ... RENAME COLUMN command, as parsed from SQL. + */ +case class AlterTableRenameColumnStatement( + tableName: Seq[String], + column: Seq[String], + newName: String) extends ParsedStatement + +/** + * ALTER TABLE ... DROP COLUMNS command, as parsed from SQL. + */ +case class AlterTableDropColumnsStatement( + tableName: Seq[String], + columnsToDrop: Seq[Seq[String]]) extends ParsedStatement + +/** + * ALTER TABLE ... SET TBLPROPERTIES command, as parsed from SQL. + */ +case class AlterTableSetPropertiesStatement( + tableName: Seq[String], + properties: Map[String, String]) extends ParsedStatement + +/** + * ALTER TABLE ... UNSET TBLPROPERTIES command, as parsed from SQL. + */ +case class AlterTableUnsetPropertiesStatement( + tableName: Seq[String], + propertyKeys: Seq[String], + ifExists: Boolean) extends ParsedStatement + +/** + * ALTER TABLE ... 
SET LOCATION command, as parsed from SQL. + */ +case class AlterTableSetLocationStatement( + tableName: Seq[String], + partitionSpec: Option[TablePartitionSpec], + location: String) extends ParsedStatement + /** * ALTER TABLE ... RECOVER PARTITIONS command, as parsed from SQL. */ diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/v2Commands.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/v2Commands.scala index 3e3c81c22b61d..c04e56355a68f 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/v2Commands.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/v2Commands.scala @@ -22,7 +22,7 @@ import org.apache.spark.sql.catalyst.catalog.CatalogTypes.TablePartitionSpec import org.apache.spark.sql.catalyst.expressions.{Attribute, AttributeReference, Expression, Unevaluable} import org.apache.spark.sql.catalyst.plans.DescribeTableSchema import org.apache.spark.sql.connector.catalog._ -import org.apache.spark.sql.connector.catalog.TableChange.ColumnPosition +import org.apache.spark.sql.connector.catalog.TableChange.{AddColumn, ColumnChange} import org.apache.spark.sql.connector.expressions.Transform import org.apache.spark.sql.types.{DataType, MetadataBuilder, StringType, StructType} @@ -384,125 +384,37 @@ case class DropTable( ifExists: Boolean) extends Command /** - * The base class for ALTER TABLE commands that work for v2 tables. + * The logical plan of the ALTER TABLE command that works for v2 tables. */ -abstract class AlterTable extends Command { - def table: LogicalPlan - - def changes: Seq[TableChange] - - override def children: Seq[LogicalPlan] = Seq(table) +case class AlterTable( + catalog: TableCatalog, + ident: Identifier, + table: NamedRelation, + changes: Seq[TableChange]) extends Command { + + override lazy val resolved: Boolean = table.resolved && { + changes.forall { + case add: AddColumn => + add.fieldNames match { + case Array(_) => + // a top-level field can always be added + true + case _ => + // the parent field must exist + table.schema.findNestedField(add.fieldNames.init, includeCollections = true).isDefined + } - override lazy val resolved: Boolean = table.resolved -} + case colChange: ColumnChange => + // the column that will be changed must exist + table.schema.findNestedField(colChange.fieldNames, includeCollections = true).isDefined -/** - * The logical plan of the ALTER TABLE ... ADD COLUMNS command that works for v2 tables. - */ -case class AlterTableAddColumns( - table: LogicalPlan, - columnsToAdd: Seq[QualifiedColType]) extends AlterTable { - override lazy val changes: Seq[TableChange] = { - columnsToAdd.map { col => - TableChange.addColumn( - col.name.toArray, - col.dataType, - col.nullable, - col.comment.orNull, - col.position.orNull) + case _ => + // property changes require no resolution checks + true } } } -/** - * The logical plan of the ALTER TABLE ... CHANGE COLUMN command that works for v2 tables. 
- */ -case class AlterTableAlterColumn( - table: LogicalPlan, - column: Seq[String], - dataType: Option[DataType], - nullable: Option[Boolean], - comment: Option[String], - position: Option[ColumnPosition]) extends AlterTable { - override lazy val changes: Seq[TableChange] = { - val colName = column.toArray - val typeChange = dataType.map { newDataType => - TableChange.updateColumnType(colName, newDataType) - } - val nullabilityChange = nullable.map { nullable => - TableChange.updateColumnNullability(colName, nullable) - } - val commentChange = comment.map { newComment => - TableChange.updateColumnComment(colName, newComment) - } - val positionChange = position.map { newPosition => - TableChange.updateColumnPosition(colName, newPosition) - } - typeChange.toSeq ++ nullabilityChange ++ commentChange ++ positionChange - } -} - -/** - * The logical plan of the ALTER TABLE ... RENAME COLUMN command that works for v2 tables. - */ -case class AlterTableRenameColumn( - table: LogicalPlan, - column: Seq[String], - newName: String) extends AlterTable { - override lazy val changes: Seq[TableChange] = { - Seq(TableChange.renameColumn(column.toArray, newName)) - } -} - -/** - * The logical plan of the ALTER TABLE ... DROP COLUMNS command that works for v2 tables. - */ -case class AlterTableDropColumns( - table: LogicalPlan, - columnsToDrop: Seq[Seq[String]]) extends AlterTable { - override lazy val changes: Seq[TableChange] = { - columnsToDrop.map(col => TableChange.deleteColumn(col.toArray)) - } -} - -/** - * The logical plan of the ALTER TABLE ... SET TBLPROPERTIES command that works for v2 tables. - */ -case class AlterTableSetProperties( - table: LogicalPlan, - properties: Map[String, String]) extends AlterTable { - override lazy val changes: Seq[TableChange] = { - properties.map { case (key, value) => - TableChange.setProperty(key, value) - }.toSeq - } -} - -/** - * The logical plan of the ALTER TABLE ... UNSET TBLPROPERTIES command that works for v2 tables. - */ -// TODO: v2 `UNSET TBLPROPERTIES` should respect the ifExists flag. -case class AlterTableUnsetProperties( - table: LogicalPlan, - propertyKeys: Seq[String], - ifExists: Boolean) extends AlterTable { - override lazy val changes: Seq[TableChange] = { - propertyKeys.map(key => TableChange.removeProperty(key)) - } -} - -/** - * The logical plan of the ALTER TABLE ... SET LOCATION command that works for v2 tables. - */ -case class AlterTableSetLocation( - table: LogicalPlan, - partitionSpec: Option[TablePartitionSpec], - location: String) extends AlterTable { - override lazy val changes: Seq[TableChange] = { - Seq(TableChange.setProperty(TableCatalog.PROP_LOCATION, location)) - } -} - /** * The logical plan of the ALTER TABLE RENAME command that works for v2 tables. 
*/ diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/connector/catalog/CatalogV2Util.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/connector/catalog/CatalogV2Util.scala index a4c7b4c3a2894..67726c7343524 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/connector/catalog/CatalogV2Util.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/connector/catalog/CatalogV2Util.scala @@ -22,7 +22,8 @@ import java.util.Collections import scala.collection.JavaConverters._ -import org.apache.spark.sql.catalyst.analysis.{NamedRelation, NoSuchDatabaseException, NoSuchNamespaceException, NoSuchTableException} +import org.apache.spark.sql.catalyst.analysis.{NamedRelation, NoSuchDatabaseException, NoSuchNamespaceException, NoSuchTableException, UnresolvedV2Relation} +import org.apache.spark.sql.catalyst.plans.logical.AlterTable import org.apache.spark.sql.connector.catalog.TableChange._ import org.apache.spark.sql.execution.datasources.v2.DataSourceV2Relation import org.apache.spark.sql.types.{ArrayType, MapType, StructField, StructType} @@ -280,6 +281,17 @@ private[sql] object CatalogV2Util { properties ++ Map(TableCatalog.PROP_OWNER -> Utils.getCurrentUserName()) } + def createAlterTable( + originalNameParts: Seq[String], + catalog: CatalogPlugin, + tableName: Seq[String], + changes: Seq[TableChange]): AlterTable = { + val tableCatalog = catalog.asTableCatalog + val ident = tableName.asIdentifier + val unresolved = UnresolvedV2Relation(originalNameParts, tableCatalog, ident) + AlterTable(tableCatalog, ident, unresolved, changes) + } + def getTableProviderCatalog( provider: SupportsCatalogOptions, catalogManager: CatalogManager, diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/parser/DDLParserSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/parser/DDLParserSuite.scala index 0bcfccdd8b90a..56d52571d1cc3 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/parser/DDLParserSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/parser/DDLParserSuite.scala @@ -476,22 +476,22 @@ class DDLParserSuite extends AnalysisTest { comparePlans( parsePlan(sql1_table), - AlterTableSetProperties( - UnresolvedTable(Seq("table_name")), Map("test" -> "test", "comment" -> "new_comment"))) + AlterTableSetPropertiesStatement( + Seq("table_name"), Map("test" -> "test", "comment" -> "new_comment"))) comparePlans( parsePlan(sql2_table), - AlterTableUnsetProperties( - UnresolvedTable(Seq("table_name")), Seq("comment", "test"), ifExists = false)) + AlterTableUnsetPropertiesStatement( + Seq("table_name"), Seq("comment", "test"), ifExists = false)) comparePlans( parsePlan(sql3_table), - AlterTableUnsetProperties( - UnresolvedTable(Seq("table_name")), Seq("comment", "test"), ifExists = true)) + AlterTableUnsetPropertiesStatement( + Seq("table_name"), Seq("comment", "test"), ifExists = true)) } test("alter table: add column") { comparePlans( parsePlan("ALTER TABLE table_name ADD COLUMN x int"), - AlterTableAddColumns(UnresolvedTable(Seq("table_name")), Seq( + AlterTableAddColumnsStatement(Seq("table_name"), Seq( QualifiedColType(Seq("x"), IntegerType, true, None, None) ))) } @@ -499,7 +499,7 @@ class DDLParserSuite extends AnalysisTest { test("alter table: add multiple columns") { comparePlans( parsePlan("ALTER TABLE table_name ADD COLUMNS x int, y string"), - AlterTableAddColumns(UnresolvedTable(Seq("table_name")), Seq( + AlterTableAddColumnsStatement(Seq("table_name"), Seq( QualifiedColType(Seq("x"), 
IntegerType, true, None, None), QualifiedColType(Seq("y"), StringType, true, None, None) ))) @@ -508,7 +508,7 @@ class DDLParserSuite extends AnalysisTest { test("alter table: add column with COLUMNS") { comparePlans( parsePlan("ALTER TABLE table_name ADD COLUMNS x int"), - AlterTableAddColumns(UnresolvedTable(Seq("table_name")), Seq( + AlterTableAddColumnsStatement(Seq("table_name"), Seq( QualifiedColType(Seq("x"), IntegerType, true, None, None) ))) } @@ -516,7 +516,7 @@ class DDLParserSuite extends AnalysisTest { test("alter table: add column with COLUMNS (...)") { comparePlans( parsePlan("ALTER TABLE table_name ADD COLUMNS (x int)"), - AlterTableAddColumns(UnresolvedTable(Seq("table_name")), Seq( + AlterTableAddColumnsStatement(Seq("table_name"), Seq( QualifiedColType(Seq("x"), IntegerType, true, None, None) ))) } @@ -524,7 +524,7 @@ class DDLParserSuite extends AnalysisTest { test("alter table: add column with COLUMNS (...) and COMMENT") { comparePlans( parsePlan("ALTER TABLE table_name ADD COLUMNS (x int COMMENT 'doc')"), - AlterTableAddColumns(UnresolvedTable(Seq("table_name")), Seq( + AlterTableAddColumnsStatement(Seq("table_name"), Seq( QualifiedColType(Seq("x"), IntegerType, true, Some("doc"), None) ))) } @@ -532,7 +532,7 @@ class DDLParserSuite extends AnalysisTest { test("alter table: add non-nullable column") { comparePlans( parsePlan("ALTER TABLE table_name ADD COLUMN x int NOT NULL"), - AlterTableAddColumns(UnresolvedTable(Seq("table_name")), Seq( + AlterTableAddColumnsStatement(Seq("table_name"), Seq( QualifiedColType(Seq("x"), IntegerType, false, None, None) ))) } @@ -540,7 +540,7 @@ class DDLParserSuite extends AnalysisTest { test("alter table: add column with COMMENT") { comparePlans( parsePlan("ALTER TABLE table_name ADD COLUMN x int COMMENT 'doc'"), - AlterTableAddColumns(UnresolvedTable(Seq("table_name")), Seq( + AlterTableAddColumnsStatement(Seq("table_name"), Seq( QualifiedColType(Seq("x"), IntegerType, true, Some("doc"), None) ))) } @@ -548,13 +548,13 @@ class DDLParserSuite extends AnalysisTest { test("alter table: add column with position") { comparePlans( parsePlan("ALTER TABLE table_name ADD COLUMN x int FIRST"), - AlterTableAddColumns(UnresolvedTable(Seq("table_name")), Seq( + AlterTableAddColumnsStatement(Seq("table_name"), Seq( QualifiedColType(Seq("x"), IntegerType, true, None, Some(first())) ))) comparePlans( parsePlan("ALTER TABLE table_name ADD COLUMN x int AFTER y"), - AlterTableAddColumns(UnresolvedTable(Seq("table_name")), Seq( + AlterTableAddColumnsStatement(Seq("table_name"), Seq( QualifiedColType(Seq("x"), IntegerType, true, None, Some(after("y"))) ))) } @@ -562,7 +562,7 @@ class DDLParserSuite extends AnalysisTest { test("alter table: add column with nested column name") { comparePlans( parsePlan("ALTER TABLE table_name ADD COLUMN x.y.z int COMMENT 'doc'"), - AlterTableAddColumns(UnresolvedTable(Seq("table_name")), Seq( + AlterTableAddColumnsStatement(Seq("table_name"), Seq( QualifiedColType(Seq("x", "y", "z"), IntegerType, true, Some("doc"), None) ))) } @@ -570,7 +570,7 @@ class DDLParserSuite extends AnalysisTest { test("alter table: add multiple columns with nested column name") { comparePlans( parsePlan("ALTER TABLE table_name ADD COLUMN x.y.z int COMMENT 'doc', a.b string FIRST"), - AlterTableAddColumns(UnresolvedTable(Seq("table_name")), Seq( + AlterTableAddColumnsStatement(Seq("table_name"), Seq( QualifiedColType(Seq("x", "y", "z"), IntegerType, true, Some("doc"), None), QualifiedColType(Seq("a", "b"), StringType, true, None, Some(first())) 
))) @@ -579,12 +579,12 @@ class DDLParserSuite extends AnalysisTest { test("alter table: set location") { comparePlans( parsePlan("ALTER TABLE a.b.c SET LOCATION 'new location'"), - AlterTableSetLocation(UnresolvedTable(Seq("a", "b", "c")), None, "new location")) + AlterTableSetLocationStatement(Seq("a", "b", "c"), None, "new location")) comparePlans( parsePlan("ALTER TABLE a.b.c PARTITION(ds='2017-06-10') SET LOCATION 'new location'"), - AlterTableSetLocation( - UnresolvedTable(Seq("a", "b", "c")), + AlterTableSetLocationStatement( + Seq("a", "b", "c"), Some(Map("ds" -> "2017-06-10")), "new location")) } @@ -592,8 +592,8 @@ class DDLParserSuite extends AnalysisTest { test("alter table: rename column") { comparePlans( parsePlan("ALTER TABLE table_name RENAME COLUMN a.b.c TO d"), - AlterTableRenameColumn( - UnresolvedTable(Seq("table_name")), + AlterTableRenameColumnStatement( + Seq("table_name"), Seq("a", "b", "c"), "d")) } @@ -601,8 +601,8 @@ class DDLParserSuite extends AnalysisTest { test("alter table: update column type using ALTER") { comparePlans( parsePlan("ALTER TABLE table_name ALTER COLUMN a.b.c TYPE bigint"), - AlterTableAlterColumn( - UnresolvedTable(Seq("table_name")), + AlterTableAlterColumnStatement( + Seq("table_name"), Seq("a", "b", "c"), Some(LongType), None, @@ -613,8 +613,8 @@ class DDLParserSuite extends AnalysisTest { test("alter table: update column type") { comparePlans( parsePlan("ALTER TABLE table_name CHANGE COLUMN a.b.c TYPE bigint"), - AlterTableAlterColumn( - UnresolvedTable(Seq("table_name")), + AlterTableAlterColumnStatement( + Seq("table_name"), Seq("a", "b", "c"), Some(LongType), None, @@ -625,8 +625,8 @@ class DDLParserSuite extends AnalysisTest { test("alter table: update column comment") { comparePlans( parsePlan("ALTER TABLE table_name CHANGE COLUMN a.b.c COMMENT 'new comment'"), - AlterTableAlterColumn( - UnresolvedTable(Seq("table_name")), + AlterTableAlterColumnStatement( + Seq("table_name"), Seq("a", "b", "c"), None, None, @@ -637,8 +637,8 @@ class DDLParserSuite extends AnalysisTest { test("alter table: update column position") { comparePlans( parsePlan("ALTER TABLE table_name CHANGE COLUMN a.b.c FIRST"), - AlterTableAlterColumn( - UnresolvedTable(Seq("table_name")), + AlterTableAlterColumnStatement( + Seq("table_name"), Seq("a", "b", "c"), None, None, @@ -650,8 +650,8 @@ class DDLParserSuite extends AnalysisTest { comparePlans( parsePlan("ALTER TABLE table_name CHANGE COLUMN a.b.c " + "TYPE bigint COMMENT 'new comment' AFTER d"), - AlterTableAlterColumn( - UnresolvedTable(Seq("table_name")), + AlterTableAlterColumnStatement( + Seq("table_name"), Seq("a", "b", "c"), Some(LongType), None, @@ -662,8 +662,8 @@ class DDLParserSuite extends AnalysisTest { test("alter table: SET/DROP NOT NULL") { comparePlans( parsePlan("ALTER TABLE table_name ALTER COLUMN a.b.c SET NOT NULL"), - AlterTableAlterColumn( - UnresolvedTable(Seq("table_name")), + AlterTableAlterColumnStatement( + Seq("table_name"), Seq("a", "b", "c"), None, Some(false), @@ -672,8 +672,8 @@ class DDLParserSuite extends AnalysisTest { comparePlans( parsePlan("ALTER TABLE table_name ALTER COLUMN a.b.c DROP NOT NULL"), - AlterTableAlterColumn( - UnresolvedTable(Seq("table_name")), + AlterTableAlterColumnStatement( + Seq("table_name"), Seq("a", "b", "c"), None, Some(true), @@ -684,7 +684,7 @@ class DDLParserSuite extends AnalysisTest { test("alter table: drop column") { comparePlans( parsePlan("ALTER TABLE table_name DROP COLUMN a.b.c"), - AlterTableDropColumns(UnresolvedTable(Seq("table_name")), 
Seq(Seq("a", "b", "c")))) + AlterTableDropColumnsStatement(Seq("table_name"), Seq(Seq("a", "b", "c")))) } test("alter table: drop multiple columns") { @@ -692,8 +692,8 @@ class DDLParserSuite extends AnalysisTest { Seq(sql, sql.replace("COLUMN", "COLUMNS")).foreach { drop => comparePlans( parsePlan(drop), - AlterTableDropColumns( - UnresolvedTable(Seq("table_name")), + AlterTableDropColumnsStatement( + Seq("table_name"), Seq(Seq("x"), Seq("y"), Seq("a", "b", "c")))) } } @@ -705,8 +705,8 @@ class DDLParserSuite extends AnalysisTest { comparePlans( parsePlan(sql1), - AlterTableAlterColumn( - UnresolvedTable(Seq("table_name")), + AlterTableAlterColumnStatement( + Seq("table_name"), Seq("a", "b", "c"), Some(IntegerType), None, @@ -715,8 +715,8 @@ class DDLParserSuite extends AnalysisTest { comparePlans( parsePlan(sql2), - AlterTableAlterColumn( - UnresolvedTable(Seq("table_name")), + AlterTableAlterColumnStatement( + Seq("table_name"), Seq("a", "b", "c"), Some(IntegerType), None, @@ -725,8 +725,8 @@ class DDLParserSuite extends AnalysisTest { comparePlans( parsePlan(sql3), - AlterTableAlterColumn( - UnresolvedTable(Seq("table_name")), + AlterTableAlterColumnStatement( + Seq("table_name"), Seq("a", "b", "c"), Some(IntegerType), None, diff --git a/sql/core/src/main/scala/org/apache/spark/sql/catalyst/analysis/ResolveSessionCatalog.scala b/sql/core/src/main/scala/org/apache/spark/sql/catalyst/analysis/ResolveSessionCatalog.scala index 0aaf9d7e2e1ac..8b0d339dbb864 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/catalyst/analysis/ResolveSessionCatalog.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/catalyst/analysis/ResolveSessionCatalog.scala @@ -24,7 +24,7 @@ import org.apache.spark.sql.catalyst.{FunctionIdentifier, TableIdentifier} import org.apache.spark.sql.catalyst.catalog.{BucketSpec, CatalogStorageFormat, CatalogTable, CatalogTableType, CatalogUtils} import org.apache.spark.sql.catalyst.plans.logical._ import org.apache.spark.sql.catalyst.rules.Rule -import org.apache.spark.sql.connector.catalog.{CatalogManager, CatalogPlugin, Identifier, LookupCatalog, SupportsNamespaces, V1Table} +import org.apache.spark.sql.connector.catalog.{CatalogManager, CatalogPlugin, Identifier, LookupCatalog, SupportsNamespaces, TableCatalog, TableChange, V1Table} import org.apache.spark.sql.connector.expressions.Transform import org.apache.spark.sql.execution.command._ import org.apache.spark.sql.execution.datasources.{CreateTable, DataSource, RefreshTable} @@ -47,63 +47,141 @@ class ResolveSessionCatalog( import org.apache.spark.sql.connector.catalog.CatalogV2Util._ override def apply(plan: LogicalPlan): LogicalPlan = plan.resolveOperatorsUp { - case AlterTableAddColumns(ResolvedTable(_, ident, _: V1Table), cols) => - cols.foreach { c => - assertTopLevelColumn(c.name, "AlterTableAddColumnsCommand") - if (!c.nullable) { - throw new AnalysisException( - "ADD COLUMN with v1 tables cannot specify NOT NULL.") + case AlterTableAddColumnsStatement( + nameParts @ SessionCatalogAndTable(catalog, tbl), cols) => + loadTable(catalog, tbl.asIdentifier).collect { + case v1Table: V1Table => + cols.foreach { c => + assertTopLevelColumn(c.name, "AlterTableAddColumnsCommand") + if (!c.nullable) { + throw new AnalysisException( + "ADD COLUMN with v1 tables cannot specify NOT NULL.") + } + } + AlterTableAddColumnsCommand(tbl.asTableIdentifier, cols.map(convertToStructField)) + }.getOrElse { + val changes = cols.map { col => + TableChange.addColumn( + col.name.toArray, + col.dataType, + col.nullable, + 
col.comment.orNull, + col.position.orNull) } + createAlterTable(nameParts, catalog, tbl, changes) } - AlterTableAddColumnsCommand(ident.asTableIdentifier, cols.map(convertToStructField)) - case a @ AlterTableAlterColumn(ResolvedTable(_, ident, _: V1Table), _, _, _, _, _) => - if (a.column.length > 1) { - throw new AnalysisException( - "ALTER COLUMN with qualified column is only supported with v2 tables.") - } - if (a.dataType.isEmpty) { - throw new AnalysisException( - "ALTER COLUMN with v1 tables must specify new data type.") - } - if (a.nullable.isDefined) { - throw new AnalysisException( - "ALTER COLUMN with v1 tables cannot specify NOT NULL.") - } - if (a.position.isDefined) { - throw new AnalysisException("" + - "ALTER COLUMN ... FIRST | ALTER is only supported with v2 tables.") - } - val builder = new MetadataBuilder - // Add comment to metadata - a.comment.map(c => builder.putString("comment", c)) - // Add Hive type string to metadata. - val cleanedDataType = HiveStringType.replaceCharType(a.dataType.get) - if (a.dataType.get != cleanedDataType) { - builder.putString(HIVE_TYPE_STRING, a.dataType.get.catalogString) + case a @ AlterTableAlterColumnStatement( + nameParts @ SessionCatalogAndTable(catalog, tbl), _, _, _, _, _) => + loadTable(catalog, tbl.asIdentifier).collect { + case v1Table: V1Table => + if (a.column.length > 1) { + throw new AnalysisException( + "ALTER COLUMN with qualified column is only supported with v2 tables.") + } + if (a.dataType.isEmpty) { + throw new AnalysisException( + "ALTER COLUMN with v1 tables must specify new data type.") + } + if (a.nullable.isDefined) { + throw new AnalysisException( + "ALTER COLUMN with v1 tables cannot specify NOT NULL.") + } + if (a.position.isDefined) { + throw new AnalysisException("" + + "ALTER COLUMN ... FIRST | ALTER is only supported with v2 tables.") + } + val builder = new MetadataBuilder + // Add comment to metadata + a.comment.map(c => builder.putString("comment", c)) + // Add Hive type string to metadata. 
+ val cleanedDataType = HiveStringType.replaceCharType(a.dataType.get) + if (a.dataType.get != cleanedDataType) { + builder.putString(HIVE_TYPE_STRING, a.dataType.get.catalogString) + } + val newColumn = StructField( + a.column(0), + cleanedDataType, + nullable = true, + builder.build()) + AlterTableChangeColumnCommand(tbl.asTableIdentifier, a.column(0), newColumn) + }.getOrElse { + val colName = a.column.toArray + val typeChange = a.dataType.map { newDataType => + TableChange.updateColumnType(colName, newDataType) + } + val nullabilityChange = a.nullable.map { nullable => + TableChange.updateColumnNullability(colName, nullable) + } + val commentChange = a.comment.map { newComment => + TableChange.updateColumnComment(colName, newComment) + } + val positionChange = a.position.map { newPosition => + TableChange.updateColumnPosition(colName, newPosition) + } + createAlterTable( + nameParts, + catalog, + tbl, + typeChange.toSeq ++ nullabilityChange ++ commentChange ++ positionChange) } - val newColumn = StructField( - a.column(0), - cleanedDataType, - nullable = true, - builder.build()) - AlterTableChangeColumnCommand(ident.asTableIdentifier, a.column(0), newColumn) - case AlterTableRenameColumn(ResolvedTable(_, _, _: V1Table), _, _) => - throw new AnalysisException("RENAME COLUMN is only supported with v2 tables.") + case AlterTableRenameColumnStatement( + nameParts @ SessionCatalogAndTable(catalog, tbl), col, newName) => + loadTable(catalog, tbl.asIdentifier).collect { + case v1Table: V1Table => + throw new AnalysisException("RENAME COLUMN is only supported with v2 tables.") + }.getOrElse { + val changes = Seq(TableChange.renameColumn(col.toArray, newName)) + createAlterTable(nameParts, catalog, tbl, changes) + } - case AlterTableDropColumns(ResolvedTable(_, _, _: V1Table), _) => - throw new AnalysisException("DROP COLUMN is only supported with v2 tables.") + case AlterTableDropColumnsStatement( + nameParts @ SessionCatalogAndTable(catalog, tbl), cols) => + loadTable(catalog, tbl.asIdentifier).collect { + case v1Table: V1Table => + throw new AnalysisException("DROP COLUMN is only supported with v2 tables.") + }.getOrElse { + val changes = cols.map(col => TableChange.deleteColumn(col.toArray)) + createAlterTable(nameParts, catalog, tbl, changes) + } - case AlterTableSetProperties(ResolvedTable(_, ident, _: V1Table), props) => - AlterTableSetPropertiesCommand(ident.asTableIdentifier, props, isView = false) + case AlterTableSetPropertiesStatement( + nameParts @ SessionCatalogAndTable(catalog, tbl), props) => + loadTable(catalog, tbl.asIdentifier).collect { + case v1Table: V1Table => + AlterTableSetPropertiesCommand(tbl.asTableIdentifier, props, isView = false) + }.getOrElse { + val changes = props.map { case (key, value) => + TableChange.setProperty(key, value) + }.toSeq + createAlterTable(nameParts, catalog, tbl, changes) + } - case AlterTableUnsetProperties(ResolvedTable(_, ident, _: V1Table), keys, ifExists) => - AlterTableUnsetPropertiesCommand(ident.asTableIdentifier, keys, ifExists, isView = false) + case AlterTableUnsetPropertiesStatement( + nameParts @ SessionCatalogAndTable(catalog, tbl), keys, ifExists) => + loadTable(catalog, tbl.asIdentifier).collect { + case v1Table: V1Table => + AlterTableUnsetPropertiesCommand( + tbl.asTableIdentifier, keys, ifExists, isView = false) + }.getOrElse { + val changes = keys.map(key => TableChange.removeProperty(key)) + createAlterTable(nameParts, catalog, tbl, changes) + } - case AlterTableSetLocation( - ResolvedTable(_, ident, _: V1Table), 
partitionSpec, newLoc) => - AlterTableSetLocationCommand(ident.asTableIdentifier, partitionSpec, newLoc) + case AlterTableSetLocationStatement( + nameParts @ SessionCatalogAndTable(catalog, tbl), partitionSpec, newLoc) => + loadTable(catalog, tbl.asIdentifier).collect { + case v1Table: V1Table => + AlterTableSetLocationCommand(tbl.asTableIdentifier, partitionSpec, newLoc) + }.getOrElse { + if (partitionSpec.nonEmpty) { + throw new AnalysisException( + "ALTER TABLE SET LOCATION does not support partition for v2 tables.") + } + val changes = Seq(TableChange.setProperty(TableCatalog.PROP_LOCATION, newLoc)) + createAlterTable(nameParts, catalog, tbl, changes) + } // ALTER VIEW should always use v1 command if the resolved catalog is session catalog. case AlterViewSetPropertiesStatement(SessionCatalogAndTable(_, tbl), props) => @@ -140,7 +218,7 @@ class ResolveSessionCatalog( DescribeTableCommand(ident.asTableIdentifier, partitionSpec, isExtended) // Use v1 command to describe (temp) view, as v2 catalog doesn't support view yet. - case DescribeRelation(ResolvedView(ident, _), partitionSpec, isExtended) => + case DescribeRelation(ResolvedView(ident), partitionSpec, isExtended) => DescribeTableCommand(ident.asTableIdentifier, partitionSpec, isExtended) case DescribeColumnStatement( diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/command/tables.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/command/tables.scala index ab47d640f4705..a92fbdf25975b 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/command/tables.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/command/tables.scala @@ -257,6 +257,14 @@ case class AlterTableAddColumnsCommand( table: TableIdentifier): CatalogTable = { val catalogTable = catalog.getTempViewOrPermanentTableMetadata(table) + if (catalogTable.tableType == CatalogTableType.VIEW) { + throw new AnalysisException( + s""" + |ALTER ADD COLUMNS does not support views. + |You must drop and re-create the views for adding the new columns. Views: $table + """.stripMargin) + } + if (DDLUtils.isDatasourceTable(catalogTable)) { DataSource.lookupDataSource(catalogTable.provider.get, conf). 
getConstructor().newInstance() match { diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/DataSourceV2Strategy.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/DataSourceV2Strategy.scala index 7c8fd4e105ca7..448a4354ddd66 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/DataSourceV2Strategy.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/DataSourceV2Strategy.scala @@ -241,18 +241,8 @@ class DataSourceV2Strategy(session: SparkSession) extends Strategy with Predicat case DropTable(catalog, ident, ifExists) => DropTableExec(catalog, ident, ifExists) :: Nil - case a @ AlterTableSetLocation(r: ResolvedTable, partitionSpec, _) => - if (partitionSpec.nonEmpty) { - throw new AnalysisException( - "ALTER TABLE SET LOCATION does not support partition for v2 tables.") - } - AlterTableExec(r.catalog, r.identifier, a.changes) :: Nil - - case a: AlterTable => - a.table match { - case r: ResolvedTable => AlterTableExec(r.catalog, r.identifier, a.changes) :: Nil - case _ => Nil - } + case AlterTable(catalog, ident, _, changes) => + AlterTableExec(catalog, ident, changes) :: Nil case RenameTable(catalog, oldIdent, newIdent) => RenameTableExec(catalog, oldIdent, newIdent) :: Nil diff --git a/sql/core/src/test/resources/sql-tests/results/change-column.sql.out b/sql/core/src/test/resources/sql-tests/results/change-column.sql.out index bb5b4ae84d3b7..82326346b361c 100644 --- a/sql/core/src/test/resources/sql-tests/results/change-column.sql.out +++ b/sql/core/src/test/resources/sql-tests/results/change-column.sql.out @@ -195,7 +195,7 @@ ALTER TABLE temp_view CHANGE a TYPE INT COMMENT 'this is column a' struct<> -- !query 20 output org.apache.spark.sql.AnalysisException -temp_view is a temp view not a table.; line 1 pos 0 +Invalid command: 'temp_view' is a view not a table.; line 1 pos 0 -- !query 21 @@ -212,7 +212,7 @@ ALTER TABLE global_temp.global_temp_view CHANGE a TYPE INT COMMENT 'this is colu struct<> -- !query 22 output org.apache.spark.sql.AnalysisException -global_temp.global_temp_view is a temp view not a table.; line 1 pos 0 +Invalid command: 'global_temp.global_temp_view' is a view not a table.; line 1 pos 0 -- !query 23 diff --git a/sql/core/src/test/scala/org/apache/spark/sql/connector/DataSourceV2SQLSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/connector/DataSourceV2SQLSuite.scala index 75c9bb7be05f4..c19352a2267df 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/connector/DataSourceV2SQLSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/connector/DataSourceV2SQLSuite.scala @@ -2201,7 +2201,7 @@ class DataSourceV2SQLSuite withTempView("v") { sql("create global temp view v as select 1") val e = intercept[AnalysisException](sql("COMMENT ON TABLE global_temp.v IS NULL")) - assert(e.getMessage.contains("global_temp.v is a temp view not a table.")) + assert(e.getMessage.contains("global_temp.v is a temp view not table.")) } } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/SQLViewSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/SQLViewSuite.scala index 4ed2506b35a80..9a393f19ce9bb 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/SQLViewSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/SQLViewSuite.scala @@ -145,13 +145,13 @@ abstract class SQLViewSuite extends QueryTest with SQLTestUtils { // For v2 ALTER TABLE statements, we have better error message saying 
view is not supported. assertAnalysisError( s"ALTER TABLE $viewName SET LOCATION '/path/to/your/lovely/heart'", - s"$viewName is a temp view not a table") + s"'$viewName' is a view not a table") - // For the following v2 ALERT TABLE statements, relations are first resolved before - // unsupported operations are checked. + // For the following v2 ALERT TABLE statements, unsupported operations are checked first + // before resolving the relations. assertAnalysisError( s"ALTER TABLE $viewName PARTITION (a='4') SET LOCATION '/path/to/home'", - s"$viewName is a temp view not a table") + "ALTER TABLE SET LOCATION does not support partition for v2 tables") } } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/command/DDLSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/command/DDLSuite.scala index e3fb535ab4cdd..1a9fe46bd6a91 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/command/DDLSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/command/DDLSuite.scala @@ -2779,7 +2779,7 @@ abstract class DDLSuite extends QueryTest with SQLTestUtils { val e = intercept[AnalysisException] { sql("ALTER TABLE tmp_v ADD COLUMNS (c3 INT)") } - assert(e.message.contains("tmp_v is a temp view not a table")) + assert(e.message.contains("'tmp_v' is a view not a table")) } } @@ -2789,8 +2789,7 @@ abstract class DDLSuite extends QueryTest with SQLTestUtils { val e = intercept[AnalysisException] { sql("ALTER TABLE v1 ADD COLUMNS (c3 INT)") } - assert(e.message.contains( - "Cannot alter a view with ALTER TABLE. Please use ALTER VIEW instead")) + assert(e.message.contains("ALTER ADD COLUMNS does not support views")) } } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/command/PlanResolutionSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/command/PlanResolutionSuite.scala index 35b2003e952e1..8f17ce7f32c82 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/command/PlanResolutionSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/command/PlanResolutionSuite.scala @@ -26,7 +26,7 @@ import org.mockito.invocation.InvocationOnMock import org.apache.spark.sql.{AnalysisException, SaveMode} import org.apache.spark.sql.catalyst.{AliasIdentifier, TableIdentifier} -import org.apache.spark.sql.catalyst.analysis._ +import org.apache.spark.sql.catalyst.analysis.{AnalysisTest, Analyzer, CTESubstitution, EmptyFunctionRegistry, NoSuchTableException, ResolveCatalogs, ResolvedTable, ResolveSessionCatalog, UnresolvedAttribute, UnresolvedRelation, UnresolvedStar, UnresolvedSubqueryColumnAliases, UnresolvedV2Relation} import org.apache.spark.sql.catalyst.catalog.{BucketSpec, CatalogStorageFormat, CatalogTable, CatalogTableType, InMemoryCatalog, SessionCatalog} import org.apache.spark.sql.catalyst.expressions.{AttributeReference, EqualTo, Expression, InSubquery, IntegerLiteral, ListQuery, StringLiteral} import org.apache.spark.sql.catalyst.parser.CatalystSqlParser @@ -140,7 +140,6 @@ class PlanResolutionSuite extends AnalysisTest { val rules = Seq( CTESubstitution, analyzer.ResolveRelations, - analyzer.ResolveTables, new ResolveCatalogs(catalogManager), new ResolveSessionCatalog(catalogManager, conf, _ == Seq("v")), analyzer.ResolveTables, @@ -724,24 +723,24 @@ class PlanResolutionSuite extends AnalysisTest { comparePlans(parsed3, expected3) } else { parsed1 match { - case a: AlterTable => - assert(a.changes == Seq( + case AlterTable(_, _, _: DataSourceV2Relation, changes) => + assert(changes 
== Seq( TableChange.setProperty("test", "test"), TableChange.setProperty("comment", "new_comment"))) case _ => fail("expect AlterTable") } parsed2 match { - case a: AlterTable => - assert(a.changes == Seq( + case AlterTable(_, _, _: DataSourceV2Relation, changes) => + assert(changes == Seq( TableChange.removeProperty("comment"), TableChange.removeProperty("test"))) case _ => fail("expect AlterTable") } parsed3 match { - case a: AlterTable => - assert(a.changes == Seq( + case AlterTable(_, _, _: DataSourceV2Relation, changes) => + assert(changes == Seq( TableChange.removeProperty("comment"), TableChange.removeProperty("test"))) case _ => fail("expect AlterTable") @@ -754,9 +753,15 @@ class PlanResolutionSuite extends AnalysisTest { val parsed4 = parseAndResolve(sql4) val parsed5 = parseAndResolve(sql5) - // For non-existing tables, we expect `UnresolvedTable` in the resolved plan. - assert(parsed4.collect{ case u: UnresolvedTable => u }.length == 1) - assert(parsed5.collect{ case u: UnresolvedTable => u }.length == 1) + // For non-existing tables, we convert it to v2 command with `UnresolvedV2Table` + parsed4 match { + case AlterTable(_, _, _: UnresolvedV2Relation, _) => // OK + case _ => fail("Expect AlterTable, but got:\n" + parsed4.treeString) + } + parsed5 match { + case AlterTable(_, _, _: UnresolvedV2Relation, _) => // OK + case _ => fail("Expect AlterTable, but got:\n" + parsed5.treeString) + } } test("support for other types in TBLPROPERTIES") { @@ -777,8 +782,8 @@ class PlanResolutionSuite extends AnalysisTest { comparePlans(parsed, expected) } else { parsed match { - case a: AlterTable => - assert(a.changes == Seq( + case AlterTable(_, _, _: DataSourceV2Relation, changes) => + assert(changes == Seq( TableChange.setProperty("a", "1"), TableChange.setProperty("b", "0.1"), TableChange.setProperty("c", "true"))) @@ -801,8 +806,8 @@ class PlanResolutionSuite extends AnalysisTest { comparePlans(parsed, expected) } else { parsed match { - case a: AlterTable => - assert(a.changes == Seq(TableChange.setProperty("location", "new location"))) + case AlterTable(_, _, _: DataSourceV2Relation, changes) => + assert(changes == Seq(TableChange.setProperty("location", "new location"))) case _ => fail("Expect AlterTable, but got:\n" + parsed.treeString) } } @@ -1043,23 +1048,23 @@ class PlanResolutionSuite extends AnalysisTest { val parsed3 = parseAndResolve(sql3) parsed1 match { - case a: AlterTable => - assert(a.changes == Seq( + case AlterTable(_, _, _: DataSourceV2Relation, changes) => + assert(changes == Seq( TableChange.updateColumnType(Array("i"), LongType))) case _ => fail("expect AlterTable") } parsed2 match { - case a: AlterTable => - assert(a.changes == Seq( + case AlterTable(_, _, _: DataSourceV2Relation, changes) => + assert(changes == Seq( TableChange.updateColumnType(Array("i"), LongType), TableChange.updateColumnComment(Array("i"), "new comment"))) case _ => fail("expect AlterTable") } parsed3 match { - case a: AlterTable => - assert(a.changes == Seq( + case AlterTable(_, _, _: DataSourceV2Relation, changes) => + assert(changes == Seq( TableChange.updateColumnComment(Array("i"), "new comment"))) case _ => fail("expect AlterTable") } diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveCommandSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveCommandSuite.scala index 9b12ac1d79e73..dbbf2b29fe8b7 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveCommandSuite.scala +++ 
b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveCommandSuite.scala @@ -158,7 +158,7 @@ class HiveCommandSuite extends QueryTest with SQLTestUtils with TestHiveSingleto val message = intercept[AnalysisException] { sql("SHOW TBLPROPERTIES parquet_temp") }.getMessage - assert(message.contains("parquet_temp is a temp view not a table")) + assert(message.contains("parquet_temp is a temp view not table")) } }

From cd9ccdc0aca588699e39b45e708bd87f6622031c Mon Sep 17 00:00:00 2001
From: HyukjinKwon
Date: Thu, 23 Jan 2020 16:00:21 +0900
Subject: [PATCH 16/17] [SPARK-30601][BUILD] Add a Google Maven Central as a primary repository

### What changes were proposed in this pull request?

This PR proposes to address four things. Three issues and fixes were a bit mixed, so this PR sorts them out. See also http://apache-spark-developers-list.1001551.n3.nabble.com/Adding-Maven-Central-mirror-from-Google-to-the-build-td28728.html for the discussion on the mailing list.

1. Add the Google Maven Central mirror (GCS) as a primary repository. This will not only make development more stable but also help make the GitHub Actions build (where it is always required to download jars) stable. The Jenkins PR builder wouldn't be affected too much, as it uses the pre-downloaded jars under `.m2`. - Google Maven Central seems stable for heavy workloads but is not synced very quickly (e.g., a new release may be missing) - Maven Central (default) seems less stable but is synced quickly. We already added this GCS mirror as a default additional remote repository in SPARK-29175, so I don't see an issue with adding it as a repo. https://github.com/apache/spark/blob/abf759a91e01497586b8bb6b7a314dd28fd6cff1/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala#L2111-L2118

2. Currently, we have a hard-coded repository in [`sbt-pom-reader`](https://github.com/JoshRosen/sbt-pom-reader/blob/v1.0.0-spark/src/main/scala/com/typesafe/sbt/pom/MavenPomResolver.scala#L32), and it seems to overwrite Maven's existing resolver with the same ID `central`, using `http://`, when the pom file is initially ported into the SBT instance. This uses `http://`, which Maven Central has recently disallowed (see https://github.com/apache/spark/pull/27242). My speculation is that we just need to be able to load the plugin and let it convert the POM to an SBT instance with another fallback repo. After that, it _seems_ to use `central` with `https` properly. See also https://github.com/apache/spark/pull/27307#issuecomment-576720395. I double-checked that we use `https` properly from the SBT build as well: ``` [debug] downloading https://repo1.maven.org/maven2/com/etsy/sbt-checkstyle-plugin_2.10_0.13/3.1.1/sbt-checkstyle-plugin-3.1.1.pom ... [debug] public: downloading https://repo1.maven.org/maven2/com/etsy/sbt-checkstyle-plugin_2.10_0.13/3.1.1/sbt-checkstyle-plugin-3.1.1.pom [debug] public: downloading https://repo1.maven.org/maven2/com/etsy/sbt-checkstyle-plugin_2.10_0.13/3.1.1/sbt-checkstyle-plugin-3.1.1.pom.sha1 ``` This was fixed by adding the same repo (https://github.com/apache/spark/pull/27281) under the ID `central_without_mirror`, which is a bit awkward. Instead, this PR adds GCS as the main repo and the community Maven Central as a fallback repo, so presumably the community Maven Central repo is used as a fallback when the plugin is loaded.

3. While I am here, I fix another issue. The GitHub Actions build at https://github.com/apache/spark/pull/27279 is failing. The reason seems to be that scalafmt 1.0.3 is in Maven Central but not in GCS.
``` org.apache.maven.plugin.PluginResolutionException: Plugin org.antipathy:mvn-scalafmt_2.12:1.0.3 or one of its dependencies could not be resolved: Could not find artifact org.antipathy:mvn-scalafmt_2.12:jar:1.0.3 in google-maven-central (https://maven-central.storage-download.googleapis.com/repos/central/data/) at org.apache.maven.plugin.internal.DefaultPluginDependenciesResolver.resolve (DefaultPluginDependenciesResolver.java:131) ``` `mvn-scalafmt` exists in Maven central: ```bash $ curl https://repo.maven.apache.org/maven2/org/antipathy/mvn-scalafmt_2.12/1.0.3/mvn-scalafmt_2.12-1.0.3.pom ``` ```xml 4.0.0 ... ``` whereas not in GCS mirror: ```bash $ curl https://maven-central.storage-download.googleapis.com/repos/central/data/org/antipathy/mvn-scalafmt_2.12/1.0.3/mvn-scalafmt_2.12-1.0.3.pom ``` ```xml NoSuchKeyThe specified key does not exist.
No such object: maven-central/repos/central/data/org/antipathy/mvn-scalafmt_2.12/1.0.3/mvn-scalafmt_2.12-1.0.3.pom
% ``` In this PR, simply make both repos accessible by adding to `pluginRepositories`. 4. Remove the workarounds in Github Actions to switch mirrors because now we have same repos in the same order (Google Maven Central first, and Maven Central second) ### Why are the changes needed? To make the build and Github Action more stable. ### Does this PR introduce any user-facing change? No, dev only change. ### How was this patch tested? I roughly checked local and PR against my fork (https://github.com/HyukjinKwon/spark/pull/2 and https://github.com/HyukjinKwon/spark/pull/3). Closes #27307 from HyukjinKwon/SPARK-30572. Authored-by: HyukjinKwon Signed-off-by: HyukjinKwon --- .github/workflows/master.yml | 8 -------- pom.xml | 32 ++++++++++++++++++++++++-------- project/SparkBuild.scala | 3 +++ 3 files changed, 27 insertions(+), 16 deletions(-) diff --git a/.github/workflows/master.yml b/.github/workflows/master.yml index e4d2d8470cc34..d53119ad75599 100644 --- a/.github/workflows/master.yml +++ b/.github/workflows/master.yml @@ -66,14 +66,6 @@ jobs: export MAVEN_OPTS="-Xmx2g -XX:ReservedCodeCacheSize=1g -Dorg.slf4j.simpleLogger.defaultLogLevel=WARN" export MAVEN_CLI_OPTS="--no-transfer-progress" mkdir -p ~/.m2 - # `Maven Central` is too flaky in terms of downloading artifacts in `GitHub Action` environment. - # `Google Maven Central Mirror` is too slow in terms of sycing upstream. To get the best combination, - # 1) we set `Google Maven Central` as a mirror of `central` in `GitHub Action` environment only. - # 2) we duplicates `Maven Central` in pom.xml with ID `central_without_mirror`. - # In other words, in GitHub Action environment, `central` is mirrored by `Google Maven Central` first. - # If `Google Maven Central` doesn't provide the artifact due to its slowness, `central_without_mirror` will be used. - # Note that we aim to achieve the above while keeping the existing behavior of non-`GitHub Action` environment unchanged. - echo "google-maven-centralGCS Maven Central mirrorhttps://maven-central.storage-download.googleapis.com/repos/central/data/central" > ~/.m2/settings.xml ./build/mvn $MAVEN_CLI_OPTS -DskipTests -Pyarn -Pmesos -Pkubernetes -Phive -P${{ matrix.hive }} -Phive-thriftserver -P${{ matrix.hadoop }} -Phadoop-cloud -Djava.version=${{ matrix.java }} install rm -rf ~/.m2/repository/org/apache/spark diff --git a/pom.xml b/pom.xml index d2b2b0205060f..4e6c571ee052c 100644 --- a/pom.xml +++ b/pom.xml @@ -246,10 +246,13 @@ - central - - Maven Repository - https://repo.maven.apache.org/maven2 + gcs-maven-central-mirror + + GCS Maven Central mirror + https://maven-central.storage-download.googleapis.com/repos/central/data/ true @@ -258,12 +261,10 @@ - central_without_mirror + central Maven Repository https://repo.maven.apache.org/maven2 @@ -275,6 +276,21 @@ + + gcs-maven-central-mirror + + GCS Maven Central mirror + https://maven-central.storage-download.googleapis.com/repos/central/data/ + + true + + + false + + central https://repo.maven.apache.org/maven2 diff --git a/project/SparkBuild.scala b/project/SparkBuild.scala index 9385505ad8ac5..87e584bd28b68 100644 --- a/project/SparkBuild.scala +++ b/project/SparkBuild.scala @@ -224,6 +224,9 @@ object SparkBuild extends PomBuild { // Override SBT's default resolvers: resolvers := Seq( + // Google Mirror of Maven Central, placed first so that it's used instead of flaky Maven Central. + // See https://storage-download.googleapis.com/maven-central/index.html for more info. 
+      "gcs-maven-central-mirror" at "https://maven-central.storage-download.googleapis.com/repos/central/data/",
       DefaultMavenRepository,
       Resolver.mavenLocal,
       Resolver.file("local", file(Path.userHome.absolutePath + "/.ivy2/local"))(Resolver.ivyStylePatterns)

From 2330a5682d376c21b73dbdf5ea10e253941e8cc8 Mon Sep 17 00:00:00 2001
From: zero323
Date: Thu, 23 Jan 2020 16:16:47 +0900
Subject: [PATCH 17/17] [SPARK-30607][SQL][PYSPARK][SPARKR] Add overlay wrappers for SparkR and PySpark

### What changes were proposed in this pull request?

This PR adds:
- `pyspark.sql.functions.overlay` function to PySpark
- `overlay` function to SparkR

### Why are the changes needed?

Feature parity. At the moment, R and Python users can access this function only through SQL or `expr` / `selectExpr`.

### Does this PR introduce any user-facing change?

No.

### How was this patch tested?

New unit tests.

Closes #27325 from zero323/SPARK-30607.

Authored-by: zero323
Signed-off-by: HyukjinKwon
---
 R/pkg/NAMESPACE                            |  1 +
 R/pkg/R/functions.R                        | 39 ++++++++++++++++++++--
 R/pkg/R/generics.R                         |  4 +++
 R/pkg/tests/fulltests/test_sparkSQL.R      |  2 ++
 python/pyspark/sql/functions.py            | 34 +++++++++++++++++++
 python/pyspark/sql/tests/test_functions.py | 27 +++++++++++++++
 6 files changed, 105 insertions(+), 2 deletions(-)

diff --git a/R/pkg/NAMESPACE b/R/pkg/NAMESPACE
index f9d9494ca6fa1..7ed2e36d59531 100644
--- a/R/pkg/NAMESPACE
+++ b/R/pkg/NAMESPACE
@@ -335,6 +335,7 @@ exportMethods("%<=>%",
               "ntile",
               "otherwise",
               "over",
+              "overlay",
               "percent_rank",
               "pmod",
               "posexplode",
diff --git a/R/pkg/R/functions.R b/R/pkg/R/functions.R
index e340b0604e983..af3b18620e0d7 100644
--- a/R/pkg/R/functions.R
+++ b/R/pkg/R/functions.R
@@ -136,6 +136,14 @@ NULL
 #'          format to. See 'Details'.
 #'          }
 #' @param y Column to compute on.
+#' @param pos In \itemize{
+#'              \item \code{locate}: a start position of search.
+#'              \item \code{overlay}: a start position for replacement.
+#'              }
+#' @param len In \itemize{
+#'              \item \code{lpad}: the maximum length of each output result.
+#'              \item \code{overlay}: a number of bytes to replace.
+#'              }
 #' @param ... additional Columns.
 #' @name column_string_functions
 #' @rdname column_string_functions
@@ -1319,6 +1327,35 @@ setMethod("negate",
             column(jc)
           })
 
+#' @details
+#' \code{overlay}: Overlay the specified portion of \code{x} with \code{replace},
+#' starting from byte position \code{pos} of \code{src} and proceeding for
+#' \code{len} bytes.
+#'
+#' @param replace a Column with replacement.
+#'
+#' @rdname column_string_functions
+#' @aliases overlay overlay,Column-method,numericOrColumn-method
+#' @note overlay since 3.0.0
+setMethod("overlay",
+          signature(x = "Column", replace = "Column", pos = "numericOrColumn"),
+          function(x, replace, pos, len = -1) {
+            if (is.numeric(pos)) {
+              pos <- lit(as.integer(pos))
+            }
+
+            if (is.numeric(len)) {
+              len <- lit(as.integer(len))
+            }
+
+            jc <- callJStatic(
+              "org.apache.spark.sql.functions", "overlay",
+              x@jc, replace@jc, pos@jc, len@jc
+            )
+
+            column(jc)
+          })
+
 #' @details
 #' \code{quarter}: Extracts the quarter as an integer from a given date/timestamp/string.
 #'
@@ -2819,7 +2856,6 @@ setMethod("window", signature(x = "Column"),
 #'
 #' @param substr a character string to be matched.
 #' @param str a Column where matches are sought for each entry.
-#' @param pos start position of search.
 #' @rdname column_string_functions
 #' @aliases locate locate,character,Column-method
 #' @note locate since 1.5.0
@@ -2834,7 +2870,6 @@ setMethod("locate", signature(substr = "character", str = "Column"),
 #' @details
 #' \code{lpad}: Left-padded with pad to a length of len.
 #'
-#' @param len maximum length of each output result.
 #' @param pad a character string to be padded with.
 #' @rdname column_string_functions
 #' @aliases lpad lpad,Column,numeric,character-method
diff --git a/R/pkg/R/generics.R b/R/pkg/R/generics.R
index f849dd172247c..4134d5cecc888 100644
--- a/R/pkg/R/generics.R
+++ b/R/pkg/R/generics.R
@@ -1149,6 +1149,10 @@ setGeneric("ntile", function(x) { standardGeneric("ntile") })
 #' @name NULL
 setGeneric("n_distinct", function(x, ...) { standardGeneric("n_distinct") })
 
+#' @rdname column_string_functions
+#' @name NULL
+setGeneric("overlay", function(x, replace, pos, ...) { standardGeneric("overlay") })
+
 #' @rdname column_window_functions
 #' @name NULL
 setGeneric("percent_rank", function(x = "missing") { standardGeneric("percent_rank") })
diff --git a/R/pkg/tests/fulltests/test_sparkSQL.R b/R/pkg/tests/fulltests/test_sparkSQL.R
index cb47353d600db..d435a8b6d7c4a 100644
--- a/R/pkg/tests/fulltests/test_sparkSQL.R
+++ b/R/pkg/tests/fulltests/test_sparkSQL.R
@@ -1405,6 +1405,8 @@ test_that("column functions", {
     trunc(c, "month") + trunc(c, "mon") + trunc(c, "mm")
   c24 <- date_trunc("hour", c) + date_trunc("minute", c) + date_trunc("week", c) +
     date_trunc("quarter", c) + current_date() + current_timestamp()
+  c25 <- overlay(c1, c2, c3, c3) + overlay(c1, c2, c3) + overlay(c1, c2, 1) +
+    overlay(c1, c2, 3, 4)
 
   # Test if base::is.nan() is exposed
   expect_equal(is.nan(c("a", "b")), c(FALSE, FALSE))
diff --git a/python/pyspark/sql/functions.py b/python/pyspark/sql/functions.py
index 176729eb518e3..7f4cda832972a 100644
--- a/python/pyspark/sql/functions.py
+++ b/python/pyspark/sql/functions.py
@@ -1600,6 +1600,40 @@ def instr(str, substr):
     return Column(sc._jvm.functions.instr(_to_java_column(str), substr))
 
 
+@since(3.0)
+def overlay(src, replace, pos, len=-1):
+    """
+    Overlay the specified portion of `src` with `replace`,
+    starting from byte position `pos` of `src` and proceeding for `len` bytes.
+
+    >>> df = spark.createDataFrame([("SPARK_SQL", "CORE")], ("x", "y"))
+    >>> df.select(overlay("x", "y", 7).alias("overlayed")).show()
+    +----------+
+    | overlayed|
+    +----------+
+    |SPARK_CORE|
+    +----------+
+    """
+    if not isinstance(pos, (int, str, Column)):
+        raise TypeError(
+            "pos should be an integer or a Column / column name, got {}".format(type(pos)))
+    if len is not None and not isinstance(len, (int, str, Column)):
+        raise TypeError(
+            "len should be an integer or a Column / column name, got {}".format(type(len)))
+
+    pos = _create_column_from_literal(pos) if isinstance(pos, int) else _to_java_column(pos)
+    len = _create_column_from_literal(len) if isinstance(len, int) else _to_java_column(len)
+
+    sc = SparkContext._active_spark_context
+
+    return Column(sc._jvm.functions.overlay(
+        _to_java_column(src),
+        _to_java_column(replace),
+        pos,
+        len
+    ))
+
+
 @since(1.5)
 @ignore_unicode_prefix
 def substring(str, pos, len):
diff --git a/python/pyspark/sql/tests/test_functions.py b/python/pyspark/sql/tests/test_functions.py
index 36e1e8a660f00..fa9ee57ff5f90 100644
--- a/python/pyspark/sql/tests/test_functions.py
+++ b/python/pyspark/sql/tests/test_functions.py
@@ -310,6 +310,33 @@ def test_input_file_name_udf(self):
         file_name = df.collect()[0].file
         self.assertTrue("python/test_support/hello/hello.txt" in file_name)
 
+    def test_overlay(self):
+        from pyspark.sql.functions import col, lit, overlay
+        from itertools import chain
+        import re
+
+        actual = list(chain.from_iterable([
+            re.findall("(overlay\\(.*\\))", str(x)) for x in [
+                overlay(col("foo"), col("bar"), 1),
+                overlay("x", "y", 3),
+                overlay(col("x"), col("y"), 1, 3),
+                overlay("x", "y", 2, 5),
+                overlay("x", "y", lit(11)),
+                overlay("x", "y", lit(2), lit(5)),
+            ]
+        ]))
+
+        expected = [
+            "overlay(foo, bar, 1, -1)",
+            "overlay(x, y, 3, -1)",
+            "overlay(x, y, 1, 3)",
+            "overlay(x, y, 2, 5)",
+            "overlay(x, y, 11, -1)",
+            "overlay(x, y, 2, 5)",
+        ]
+
+        self.assertListEqual(actual, expected)
+
 
 if __name__ == "__main__":
     import unittest
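
For reviewers trying the patch out, a minimal usage sketch of the new PySpark `overlay` wrapper follows. Only the `createDataFrame` call mirrors the doctest above; the `SparkSession` setup, the aliases, the `len=0` expectation, and the `expr()` fallback are illustrative assumptions about a local Spark 3.0 build with this patch applied, not part of the patch itself.

```python
# Hypothetical sketch; assumes a local Spark 3.0+ build that includes this patch.
from pyspark.sql import SparkSession
from pyspark.sql.functions import expr, overlay

spark = SparkSession.builder.master("local[1]").appName("overlay-demo").getOrCreate()
df = spark.createDataFrame([("SPARK_SQL", "CORE")], ("x", "y"))

# Default len=-1 replaces length(y) bytes of x starting at position 7, as in the doctest.
df.select(overlay("x", "y", 7).alias("overlayed")).show()    # SPARK_CORE

# An explicit len replaces exactly that many bytes; len=0 should insert y at position 7.
df.select(overlay("x", "y", 7, 0).alias("inserted")).show()  # expected: SPARK_CORESQL

# Pre-patch workaround mentioned in the commit message: go through SQL via expr().
df.select(expr("overlay(x, y, 7)").alias("via_expr")).show() # SPARK_CORE

spark.stop()
```

The SparkR wrapper added above exposes the same signature (`overlay(x, replace, pos, len = -1)`), so the calls translate directly to R.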