[SPARK-47101][SQL] Allow comma to be used in top-level column names and remove check nested type definition in HiveExternalCatalog.verifyDataSchema
#45180
Changes from all commits: bca98a8, 4cc8e1e, 4f98793, 4b7a3c7, 30cabaf, a9061f6, 8464a1e
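For context, the behavior change the title describes, as a hedged sketch (illustrative table and column names, derived from the test changes below; assumes a spark-shell with Hive support):

// Illustrative sketch, not code from this PR.
// Before SPARK-47101: HiveExternalCatalog.verifyDataSchema rejected the comma
// with INVALID_HIVE_COLUMN_NAME. After: the statement succeeds, since the
// backticks make the comma part of the top-level column name.
spark.sql("CREATE TABLE demo STORED AS parquet SELECT 1 AS `a,b`")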
@@ -34,10 +34,10 @@ import org.apache.spark.sql.catalyst.parser.{CatalystSqlParser, ParseException}
 import org.apache.spark.sql.connector.catalog.CatalogManager
 import org.apache.spark.sql.connector.catalog.CatalogManager.SESSION_CATALOG_NAME
 import org.apache.spark.sql.connector.catalog.SupportsNamespaces.PROP_OWNER
+import org.apache.spark.sql.errors.DataTypeErrors.toSQLType
 import org.apache.spark.sql.execution.command.{DDLSuite, DDLUtils}
 import org.apache.spark.sql.execution.datasources.orc.OrcCompressionCodec
 import org.apache.spark.sql.execution.datasources.parquet.{ParquetCompressionCodec, ParquetFooterReader}
 import org.apache.spark.sql.functions._
 import org.apache.spark.sql.hive.{HiveExternalCatalog, HiveUtils}
 import org.apache.spark.sql.hive.HiveUtils.{CONVERT_METASTORE_ORC, CONVERT_METASTORE_PARQUET}
 import org.apache.spark.sql.hive.orc.OrcFileOperator
@@ -2876,22 +2876,39 @@ class HiveDDLSuite
     }
   }

-  test("SPARK-24681 checks if nested column names do not include ',', ':', and ';'") {
-    Seq("nested,column", "nested:column", "nested;column").foreach { nestedColumnName =>
+  test("SPARK-47101 checks if nested column names do not include invalid characters") {
+    // delimiter characters
+    Seq(",", ":").foreach { c =>
+      val typ = s"array<struct<`abc${c}xyz`:int>>"
+      // The regex is from HiveClientImpl.getSparkSQLDataType, please keep them in sync.
+      val replaced = typ.replaceAll("`", "").replaceAll("(?<=struct<|,)([^,<:]+)(?=:)", "`$1`")
+      withTable("t") {
+        checkError(
+          exception = intercept[SparkException] {
+            sql(s"CREATE TABLE t (a $typ) USING hive")
+          },
+          errorClass = "CANNOT_RECOGNIZE_HIVE_TYPE",
+          parameters = Map(
+            "fieldType" -> toSQLType(replaced),
+            "fieldName" -> "`a`")
+        )
+      }
+    }
+    // other special characters
+    Seq(";", "^", "\\", "/", "%").foreach { c =>
+      val typ = s"array<struct<`abc${c}xyz`:int>>"
+      val replaced = typ.replaceAll("`", "")
+      val msg = s"java.lang.IllegalArgumentException: Error: : expected at the position " +
+        s"16 of '$replaced' but '$c' is found."
       withTable("t") {
         checkError(
           exception = intercept[AnalysisException] {
-            spark.range(1)
-              .select(struct(lit(0).as(nestedColumnName)).as("toplevel"))
-              .write
-              .format("hive")
-              .saveAsTable("t")
+            sql(s"CREATE TABLE t (a $typ) USING hive")
           },
-          errorClass = "INVALID_HIVE_COLUMN_NAME",
+          errorClass = "_LEGACY_ERROR_TEMP_3065",
           parameters = Map(
-            "invalidChars" -> "',', ':', ';'",
-            "tableName" -> "`spark_catalog`.`default`.`t`",
-            "columnName" -> s"`$nestedColumnName`")
+            "clazz" -> "org.apache.hadoop.hive.ql.metadata.HiveException",
+            "msg" -> msg)
         )
       }
     }

Review comments on this hunk:

On `sql(s"CREATE TABLE t (a $typ) USING hive")`:
Contributor: For parquet tables, do we still have this error?
Author: Still fine.

On `errorClass = "CANNOT_RECOGNIZE_HIVE_TYPE"`:
Contributor: Just for my education, where do we throw this error?
Author: In HiveClientImpl.getSparkSQLDataType.
Member: Got it. We can reuse the existing error class in this case.

On `errorClass = "_LEGACY_ERROR_TEMP_3065"`:
Member: Why does this PR switch from INVALID_HIVE_COLUMN_NAME to _LEGACY_ERROR_TEMP_3065? Can we exclude the deletion of INVALID_HIVE_COLUMN_NAME from this PR?
Author: INVALID_HIVE_COLUMN_NAME is not necessary anymore: 1) the restrictions for column names have been removed in this PR; 2) nested field names belong to the data type part. For these two reasons, INVALID_HIVE_COLUMN_NAME can be removed. _LEGACY_ERROR_TEMP_3065 is thrown when the underlying Hive call fails; it wraps the org.apache.hadoop.hive.ql.metadata.HiveException seen in the expected message above.
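Where "position 16" in the expected message comes from, as a standalone check (plain Scala, not from the PR): the de-quoted type string starts with `array<struct<abc`, which is 16 characters, so Hive's type parser reports the offending character at offset 16, where it expected a `:`.

// Not from the PR: reproduces the position arithmetic in the expected message.
val c = ";"
val replaced = s"array<struct<abc${c}xyz:int>>" // backticks already stripped
val pos = replaced.indexOf(c)                   // "array<struct<abc".length == 16
println(s"Error: : expected at the position $pos of '$replaced' but '$c' is found.")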
@@ -3385,24 +3402,11 @@ class HiveDDLSuite
     }
   }

-  test("SPARK-44911: Create the table with invalid column") {
+  test("SPARK-47101: comma is allowed in column name") {
     val tbl = "t1"
     withTable(tbl) {
-      val e = intercept[AnalysisException] {
-        sql(
-          s"""
-            |CREATE TABLE t1
-            |STORED AS parquet
-            |SELECT id, DATE'2018-01-01' + MAKE_DT_INTERVAL(0, id) FROM RANGE(0, 10)
-          """.stripMargin)
-      }
-      checkError(e,
-        errorClass = "INVALID_HIVE_COLUMN_NAME",
-        parameters = Map(
-          "invalidChars" -> "','",
-          "tableName" -> "`spark_catalog`.`default`.`t1`",
-          "columnName" -> "`DATE '2018-01-01' + make_dt_interval(0, id, 0, 0`.`000000)`")
-      )
+      sql("CREATE TABLE t1 STORED AS parquet SELECT id as `a,b` FROM range(1)")
+      checkAnswer(sql("SELECT * FROM t1"), Row(0))
     }
   }
 }

Review comments on this hunk:

On the simplified CREATE TABLE statement:
Member: Thank you for adding this simpler version.
Author: Hi @dongjoon-hyun, this is changed via request from @cloud-fan: #45180 (comment)
Member: Ah, got it~
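For readers wondering how such a column is referenced afterwards, a sketch assuming an active `spark` session and the `t1` table created above (not part of the PR): in SQL the name must stay backtick-quoted, while the Dataset API needs no quoting because the name is not parsed as SQL.

// Not from the PR: addressing a comma-named column after creation.
val df = spark.table("t1")
df.select(df("a,b")).show()
spark.sql("SELECT `a,b` FROM t1").show()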
Review thread on the `replaceAll` regex in the SPARK-47101 nested-column test:
Reviewer: Does this replace rule come from Hive? Can we have a link?
Author: OK
Reviewer: I feel it's clearer to write the string literal of the replaced value, instead of using this complex regex.
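Echoing that suggestion, a standalone sketch (plain Scala, not from the PR) printing the string literal the regex yields for the comma case, i.e. the value that reaches `toSQLType(replaced)`:

// Not from the PR: the literal output of the re-quoting regex for the "," case.
// Stripping the backticks lets the comma split the field name, so the regex
// re-quotes only the fragment sitting directly in front of the ':'.
val typ = "array<struct<`abc,xyz`:int>>"
val replaced = typ
  .replaceAll("`", "")
  .replaceAll("(?<=struct<|,)([^,<:]+)(?=:)", "`$1`")
println(replaced) // prints: array<struct<abc,`xyz`:int>>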